Uploaded checkpoint-20000

- model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +3511 -3
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:1bcfce4195a15d1c67172a7aefe67af94867820d68efa2e46ab93cd2b5fa134b
 size 2692969128
optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:bc261511118721b030d66cbd19ed9d28203a26f346b752eaad9f2d558bde7468
 size 5386075202
rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:15a3a2ff59d530809f33e75e185c52fdceb84e6eba7c55faa5cf42d910644089
 size 14244
scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e7dc694a733ff91b79c5eaf7bcfe8aa41771c4ef8a47d325d2a9e9f6bc78f946
 size 1064
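All four binary files above are stored through Git LFS, so the diff touches only the three-line pointer files: each oid moves to the new blob's sha256 while the size stays the same as in checkpoint-15000. A minimal sketch of reading such a pointer (assuming a checkout where the pointer files themselves are present, e.g. cloned with GIT_LFS_SKIP_SMUDGE=1; the helper name is ours):

# Parse a Git LFS pointer file ("key value" per line: version, oid, size).
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# e.g. parse_lfs_pointer("scheduler.pt")["oid"]
# -> "sha256:e7dc694a733ff91b79c5eaf7bcfe8aa41771c4ef8a47d325d2a9e9f6bc78f946"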
trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": 1.3379485607147217,
   "best_model_checkpoint": "runs/deepseek_20240422-210351/checkpoint-15000",
-  "epoch": 0.
+  "epoch": 0.5,
   "eval_steps": 5000,
-  "global_step":
+  "global_step": 20000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -10531,6 +10531,3514 @@
       "eval_samples_per_second": 16.862,
       "eval_steps_per_second": 16.862,
       "step": 15000
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 9.0625,
+      "learning_rate": 1.016271186440678e-05,
+      "loss": 1.4647,
+      "step": 15010
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 61.0,
+      "learning_rate": 1.015593220338983e-05,
+      "loss": 1.2361,
+      "step": 15020
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 23.625,
+      "learning_rate": 1.0149152542372882e-05,
+      "loss": 1.2726,
+      "step": 15030
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 19.375,
+      "learning_rate": 1.0142372881355933e-05,
+      "loss": 1.3574,
+      "step": 15040
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 49.0,
+      "learning_rate": 1.0135593220338985e-05,
+      "loss": 1.3779,
+      "step": 15050
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 41.5,
+      "learning_rate": 1.0128813559322034e-05,
+      "loss": 1.1373,
+      "step": 15060
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 15.1875,
+      "learning_rate": 1.0122033898305086e-05,
+      "loss": 1.5297,
+      "step": 15070
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 27.375,
+      "learning_rate": 1.0115254237288137e-05,
+      "loss": 1.2211,
+      "step": 15080
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 10.0625,
+      "learning_rate": 1.0108474576271189e-05,
+      "loss": 1.2089,
+      "step": 15090
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 23.125,
+      "learning_rate": 1.0101694915254238e-05,
+      "loss": 1.3034,
+      "step": 15100
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 21.375,
+      "learning_rate": 1.009491525423729e-05,
+      "loss": 1.3993,
+      "step": 15110
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 25.625,
+      "learning_rate": 1.0088135593220341e-05,
+      "loss": 1.3298,
+      "step": 15120
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 20.5,
+      "learning_rate": 1.008135593220339e-05,
+      "loss": 1.4189,
+      "step": 15130
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 35.75,
+      "learning_rate": 1.0074576271186442e-05,
+      "loss": 1.415,
+      "step": 15140
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 17.625,
+      "learning_rate": 1.0067796610169492e-05,
+      "loss": 1.3125,
+      "step": 15150
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 10.125,
+      "learning_rate": 1.0061016949152542e-05,
+      "loss": 1.2626,
+      "step": 15160
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 23.0,
+      "learning_rate": 1.0054237288135593e-05,
+      "loss": 1.2828,
+      "step": 15170
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 14.75,
+      "learning_rate": 1.0047457627118644e-05,
+      "loss": 1.358,
+      "step": 15180
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 22.75,
+      "learning_rate": 1.0040677966101696e-05,
+      "loss": 1.2964,
+      "step": 15190
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 14.0,
+      "learning_rate": 1.0033898305084746e-05,
+      "loss": 1.4093,
+      "step": 15200
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 58.25,
+      "learning_rate": 1.0027118644067797e-05,
+      "loss": 1.3956,
+      "step": 15210
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 27.25,
+      "learning_rate": 1.0020338983050848e-05,
+      "loss": 1.3254,
+      "step": 15220
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 12.75,
+      "learning_rate": 1.00135593220339e-05,
+      "loss": 1.4936,
+      "step": 15230
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 13.25,
+      "learning_rate": 1.000677966101695e-05,
+      "loss": 1.357,
+      "step": 15240
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 51.25,
+      "learning_rate": 1e-05,
+      "loss": 1.3777,
+      "step": 15250
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 33.25,
+      "learning_rate": 9.993220338983052e-06,
+      "loss": 1.3847,
+      "step": 15260
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 32.75,
+      "learning_rate": 9.986440677966102e-06,
+      "loss": 1.4158,
+      "step": 15270
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 16.75,
+      "learning_rate": 9.979661016949153e-06,
+      "loss": 1.3884,
+      "step": 15280
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 35.25,
+      "learning_rate": 9.972881355932205e-06,
+      "loss": 1.3907,
+      "step": 15290
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 60.5,
+      "learning_rate": 9.966101694915256e-06,
+      "loss": 1.5153,
+      "step": 15300
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 53.5,
+      "learning_rate": 9.959322033898306e-06,
+      "loss": 1.4849,
+      "step": 15310
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 17.875,
+      "learning_rate": 9.952542372881356e-06,
+      "loss": 1.4594,
+      "step": 15320
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 10.25,
+      "learning_rate": 9.945762711864407e-06,
+      "loss": 1.4785,
+      "step": 15330
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 19.75,
+      "learning_rate": 9.938983050847458e-06,
+      "loss": 1.3351,
+      "step": 15340
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 10.875,
+      "learning_rate": 9.93220338983051e-06,
+      "loss": 1.3255,
+      "step": 15350
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 31.0,
+      "learning_rate": 9.92542372881356e-06,
+      "loss": 1.2516,
+      "step": 15360
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 23.75,
+      "learning_rate": 9.918644067796611e-06,
+      "loss": 1.3803,
+      "step": 15370
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 13.875,
+      "learning_rate": 9.911864406779662e-06,
+      "loss": 1.5092,
+      "step": 15380
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 14.25,
+      "learning_rate": 9.905084745762714e-06,
+      "loss": 1.2757,
+      "step": 15390
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 14.0625,
+      "learning_rate": 9.898305084745763e-06,
+      "loss": 1.44,
+      "step": 15400
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 35.0,
+      "learning_rate": 9.891525423728813e-06,
+      "loss": 1.2554,
+      "step": 15410
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 13.0625,
+      "learning_rate": 9.884745762711864e-06,
+      "loss": 1.3457,
+      "step": 15420
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 55.75,
+      "learning_rate": 9.877966101694916e-06,
+      "loss": 1.3047,
+      "step": 15430
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 22.625,
+      "learning_rate": 9.871186440677967e-06,
+      "loss": 1.3591,
+      "step": 15440
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 22.5,
+      "learning_rate": 9.864406779661017e-06,
+      "loss": 1.353,
+      "step": 15450
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 26.75,
+      "learning_rate": 9.857627118644068e-06,
+      "loss": 1.3824,
+      "step": 15460
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 20.75,
+      "learning_rate": 9.85084745762712e-06,
+      "loss": 1.5898,
+      "step": 15470
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 9.25,
+      "learning_rate": 9.844067796610171e-06,
+      "loss": 1.4088,
+      "step": 15480
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 31.875,
+      "learning_rate": 9.837288135593221e-06,
+      "loss": 1.3672,
+      "step": 15490
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 24.0,
+      "learning_rate": 9.830508474576272e-06,
+      "loss": 1.364,
+      "step": 15500
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 32.25,
+      "learning_rate": 9.823728813559322e-06,
+      "loss": 1.4248,
+      "step": 15510
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 26.875,
+      "learning_rate": 9.816949152542373e-06,
+      "loss": 1.2727,
+      "step": 15520
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 16.25,
+      "learning_rate": 9.810169491525425e-06,
+      "loss": 1.4068,
+      "step": 15530
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 33.5,
+      "learning_rate": 9.803389830508474e-06,
+      "loss": 1.4151,
+      "step": 15540
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 13.0625,
+      "learning_rate": 9.796610169491526e-06,
+      "loss": 1.278,
+      "step": 15550
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 14.6875,
+      "learning_rate": 9.789830508474577e-06,
+      "loss": 1.2652,
+      "step": 15560
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 13.5,
+      "learning_rate": 9.783050847457629e-06,
+      "loss": 1.397,
+      "step": 15570
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 12.9375,
+      "learning_rate": 9.776271186440678e-06,
+      "loss": 1.2682,
+      "step": 15580
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 34.0,
+      "learning_rate": 9.76949152542373e-06,
+      "loss": 1.152,
+      "step": 15590
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 81.5,
+      "learning_rate": 9.762711864406781e-06,
+      "loss": 1.3513,
+      "step": 15600
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 18.5,
+      "learning_rate": 9.755932203389833e-06,
+      "loss": 1.377,
+      "step": 15610
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 20.625,
+      "learning_rate": 9.749152542372882e-06,
+      "loss": 1.3957,
+      "step": 15620
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 33.0,
+      "learning_rate": 9.742372881355932e-06,
+      "loss": 1.3222,
+      "step": 15630
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 12.375,
+      "learning_rate": 9.735593220338983e-06,
+      "loss": 1.4281,
+      "step": 15640
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 30.25,
+      "learning_rate": 9.728813559322035e-06,
+      "loss": 1.4635,
+      "step": 15650
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 18.0,
+      "learning_rate": 9.722033898305086e-06,
+      "loss": 1.3443,
+      "step": 15660
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 76.0,
+      "learning_rate": 9.715254237288136e-06,
+      "loss": 1.4472,
+      "step": 15670
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 49.5,
+      "learning_rate": 9.708474576271187e-06,
+      "loss": 1.1069,
+      "step": 15680
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 11.4375,
+      "learning_rate": 9.701694915254239e-06,
+      "loss": 1.5717,
+      "step": 15690
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 49.75,
+      "learning_rate": 9.69491525423729e-06,
+      "loss": 1.4191,
+      "step": 15700
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 19.0,
+      "learning_rate": 9.68813559322034e-06,
+      "loss": 1.389,
+      "step": 15710
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 12.6875,
+      "learning_rate": 9.68135593220339e-06,
+      "loss": 1.1881,
+      "step": 15720
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 28.5,
+      "learning_rate": 9.674576271186441e-06,
+      "loss": 1.2252,
+      "step": 15730
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 34.25,
+      "learning_rate": 9.667796610169492e-06,
+      "loss": 1.4493,
+      "step": 15740
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 21.125,
+      "learning_rate": 9.661016949152544e-06,
+      "loss": 1.4645,
+      "step": 15750
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 13.1875,
+      "learning_rate": 9.654237288135593e-06,
+      "loss": 1.4717,
+      "step": 15760
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 11.0625,
+      "learning_rate": 9.647457627118645e-06,
+      "loss": 1.2183,
+      "step": 15770
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 21.75,
+      "learning_rate": 9.640677966101696e-06,
+      "loss": 1.3174,
+      "step": 15780
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 11.75,
+      "learning_rate": 9.633898305084746e-06,
+      "loss": 1.3315,
+      "step": 15790
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 9.875,
+      "learning_rate": 9.627118644067797e-06,
+      "loss": 1.2776,
+      "step": 15800
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 23.75,
+      "learning_rate": 9.620338983050849e-06,
+      "loss": 1.5261,
+      "step": 15810
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 11.3125,
+      "learning_rate": 9.6135593220339e-06,
+      "loss": 1.4339,
+      "step": 15820
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 35.25,
+      "learning_rate": 9.60677966101695e-06,
+      "loss": 1.4765,
+      "step": 15830
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 108.0,
+      "learning_rate": 9.600000000000001e-06,
+      "loss": 1.3046,
+      "step": 15840
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 43.75,
+      "learning_rate": 9.593220338983051e-06,
+      "loss": 1.2919,
+      "step": 15850
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 29.0,
+      "learning_rate": 9.586440677966102e-06,
+      "loss": 1.2598,
+      "step": 15860
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 13.375,
+      "learning_rate": 9.579661016949154e-06,
+      "loss": 1.3753,
+      "step": 15870
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 26.75,
+      "learning_rate": 9.572881355932203e-06,
+      "loss": 1.1029,
+      "step": 15880
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 25.375,
+      "learning_rate": 9.566101694915255e-06,
+      "loss": 1.428,
+      "step": 15890
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 34.0,
+      "learning_rate": 9.559322033898306e-06,
+      "loss": 1.504,
+      "step": 15900
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 17.5,
+      "learning_rate": 9.552542372881358e-06,
+      "loss": 1.2561,
+      "step": 15910
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 22.125,
+      "learning_rate": 9.545762711864407e-06,
+      "loss": 1.4642,
+      "step": 15920
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 23.75,
+      "learning_rate": 9.538983050847457e-06,
+      "loss": 1.4706,
+      "step": 15930
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 25.625,
+      "learning_rate": 9.532203389830508e-06,
+      "loss": 1.4855,
+      "step": 15940
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 52.0,
+      "learning_rate": 9.52542372881356e-06,
+      "loss": 1.3423,
+      "step": 15950
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 16.875,
+      "learning_rate": 9.518644067796611e-06,
+      "loss": 1.4521,
+      "step": 15960
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 21.375,
+      "learning_rate": 9.511864406779661e-06,
+      "loss": 1.3876,
+      "step": 15970
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 26.75,
+      "learning_rate": 9.505084745762712e-06,
+      "loss": 1.3349,
+      "step": 15980
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 25.0,
+      "learning_rate": 9.498305084745764e-06,
+      "loss": 1.321,
+      "step": 15990
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 30.25,
+      "learning_rate": 9.491525423728815e-06,
+      "loss": 1.4817,
+      "step": 16000
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 23.5,
+      "learning_rate": 9.484745762711865e-06,
+      "loss": 1.4348,
+      "step": 16010
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 6.59375,
+      "learning_rate": 9.477966101694916e-06,
+      "loss": 1.3662,
+      "step": 16020
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 18.75,
+      "learning_rate": 9.471186440677966e-06,
+      "loss": 1.1267,
+      "step": 16030
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 16.125,
+      "learning_rate": 9.464406779661017e-06,
+      "loss": 1.3971,
+      "step": 16040
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 16.125,
+      "learning_rate": 9.457627118644069e-06,
+      "loss": 1.2916,
+      "step": 16050
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 24.25,
+      "learning_rate": 9.450847457627119e-06,
+      "loss": 1.3208,
+      "step": 16060
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 11.625,
+      "learning_rate": 9.44406779661017e-06,
+      "loss": 1.2998,
+      "step": 16070
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 22.25,
+      "learning_rate": 9.437288135593221e-06,
+      "loss": 1.2988,
+      "step": 16080
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 29.375,
+      "learning_rate": 9.430508474576273e-06,
+      "loss": 1.1747,
+      "step": 16090
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 51.0,
+      "learning_rate": 9.423728813559322e-06,
+      "loss": 1.4036,
+      "step": 16100
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 20.625,
+      "learning_rate": 9.416949152542374e-06,
+      "loss": 1.4647,
+      "step": 16110
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 38.25,
+      "learning_rate": 9.410169491525425e-06,
+      "loss": 1.2687,
+      "step": 16120
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 21.25,
+      "learning_rate": 9.403389830508477e-06,
+      "loss": 1.2204,
+      "step": 16130
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 18.625,
+      "learning_rate": 9.396610169491526e-06,
+      "loss": 1.3104,
+      "step": 16140
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 16.375,
+      "learning_rate": 9.389830508474576e-06,
+      "loss": 1.3218,
+      "step": 16150
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 12.4375,
+      "learning_rate": 9.383050847457627e-06,
+      "loss": 1.5096,
+      "step": 16160
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 30.5,
+      "learning_rate": 9.376271186440679e-06,
+      "loss": 1.507,
+      "step": 16170
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 20.375,
+      "learning_rate": 9.36949152542373e-06,
+      "loss": 1.4757,
+      "step": 16180
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 20.0,
+      "learning_rate": 9.36271186440678e-06,
+      "loss": 1.2358,
+      "step": 16190
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 9.25,
+      "learning_rate": 9.355932203389831e-06,
+      "loss": 1.3804,
+      "step": 16200
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 15.875,
+      "learning_rate": 9.349152542372883e-06,
+      "loss": 1.4316,
+      "step": 16210
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 14.5,
+      "learning_rate": 9.342372881355934e-06,
+      "loss": 1.2718,
+      "step": 16220
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 19.25,
+      "learning_rate": 9.335593220338984e-06,
+      "loss": 1.2964,
+      "step": 16230
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 15.8125,
+      "learning_rate": 9.328813559322034e-06,
+      "loss": 1.4263,
+      "step": 16240
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 43.5,
+      "learning_rate": 9.322033898305085e-06,
+      "loss": 1.4527,
+      "step": 16250
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 16.125,
+      "learning_rate": 9.315254237288136e-06,
+      "loss": 1.2035,
+      "step": 16260
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 25.125,
+      "learning_rate": 9.308474576271188e-06,
+      "loss": 1.2739,
+      "step": 16270
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 10.0,
+      "learning_rate": 9.301694915254237e-06,
+      "loss": 1.2829,
+      "step": 16280
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 9.9375,
+      "learning_rate": 9.294915254237289e-06,
+      "loss": 1.4364,
+      "step": 16290
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 6.5625,
+      "learning_rate": 9.28813559322034e-06,
+      "loss": 1.1994,
+      "step": 16300
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 14.9375,
+      "learning_rate": 9.28135593220339e-06,
+      "loss": 1.4619,
+      "step": 16310
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 26.75,
+      "learning_rate": 9.274576271186441e-06,
+      "loss": 1.1941,
+      "step": 16320
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 20.125,
+      "learning_rate": 9.267796610169493e-06,
+      "loss": 1.3401,
+      "step": 16330
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 14.6875,
+      "learning_rate": 9.261016949152544e-06,
+      "loss": 1.2734,
+      "step": 16340
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 14.5,
+      "learning_rate": 9.254237288135594e-06,
+      "loss": 1.4338,
+      "step": 16350
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 22.75,
+      "learning_rate": 9.247457627118645e-06,
+      "loss": 1.2198,
+      "step": 16360
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 28.75,
+      "learning_rate": 9.240677966101695e-06,
+      "loss": 1.1812,
+      "step": 16370
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 13.9375,
+      "learning_rate": 9.233898305084746e-06,
+      "loss": 1.479,
+      "step": 16380
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 30.375,
+      "learning_rate": 9.227118644067798e-06,
+      "loss": 1.4046,
+      "step": 16390
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 12.125,
+      "learning_rate": 9.220338983050847e-06,
+      "loss": 1.4306,
+      "step": 16400
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 8.8125,
+      "learning_rate": 9.213559322033899e-06,
+      "loss": 1.4345,
+      "step": 16410
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 15.875,
+      "learning_rate": 9.20677966101695e-06,
+      "loss": 1.4688,
+      "step": 16420
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 28.125,
+      "learning_rate": 9.200000000000002e-06,
+      "loss": 1.1588,
+      "step": 16430
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 13.375,
+      "learning_rate": 9.193220338983051e-06,
+      "loss": 1.3002,
+      "step": 16440
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 22.75,
+      "learning_rate": 9.186440677966101e-06,
+      "loss": 1.5033,
+      "step": 16450
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 14.9375,
+      "learning_rate": 9.179661016949153e-06,
+      "loss": 1.5189,
+      "step": 16460
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 45.0,
+      "learning_rate": 9.172881355932204e-06,
+      "loss": 1.3223,
+      "step": 16470
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 34.25,
+      "learning_rate": 9.166101694915255e-06,
+      "loss": 1.4807,
+      "step": 16480
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 10.125,
+      "learning_rate": 9.159322033898305e-06,
+      "loss": 1.2578,
+      "step": 16490
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 41.5,
+      "learning_rate": 9.152542372881356e-06,
+      "loss": 1.3199,
+      "step": 16500
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 27.5,
+      "learning_rate": 9.145762711864408e-06,
+      "loss": 1.5607,
+      "step": 16510
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 15.625,
+      "learning_rate": 9.13898305084746e-06,
+      "loss": 1.3342,
+      "step": 16520
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 16.125,
+      "learning_rate": 9.132203389830509e-06,
+      "loss": 1.4272,
+      "step": 16530
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 25.875,
+      "learning_rate": 9.12542372881356e-06,
+      "loss": 1.2709,
+      "step": 16540
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 39.0,
+      "learning_rate": 9.11864406779661e-06,
+      "loss": 1.3328,
+      "step": 16550
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 14.3125,
+      "learning_rate": 9.111864406779661e-06,
+      "loss": 1.4916,
+      "step": 16560
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 17.875,
+      "learning_rate": 9.105084745762713e-06,
+      "loss": 1.3991,
+      "step": 16570
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 62.75,
+      "learning_rate": 9.098305084745763e-06,
+      "loss": 1.5126,
+      "step": 16580
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 30.875,
+      "learning_rate": 9.091525423728814e-06,
+      "loss": 1.4149,
+      "step": 16590
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 15.0625,
+      "learning_rate": 9.084745762711865e-06,
+      "loss": 1.2595,
+      "step": 16600
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 14.9375,
+      "learning_rate": 9.077966101694917e-06,
+      "loss": 1.3483,
+      "step": 16610
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 22.125,
+      "learning_rate": 9.071186440677966e-06,
+      "loss": 1.4082,
+      "step": 16620
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 10.5,
+      "learning_rate": 9.064406779661018e-06,
+      "loss": 1.5419,
+      "step": 16630
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 30.625,
+      "learning_rate": 9.05762711864407e-06,
+      "loss": 1.3988,
+      "step": 16640
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 15.75,
+      "learning_rate": 9.05084745762712e-06,
+      "loss": 1.377,
+      "step": 16650
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 14.8125,
+      "learning_rate": 9.04406779661017e-06,
+      "loss": 1.2976,
+      "step": 16660
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 27.75,
+      "learning_rate": 9.03728813559322e-06,
+      "loss": 1.3353,
+      "step": 16670
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 28.0,
+      "learning_rate": 9.030508474576271e-06,
+      "loss": 1.4809,
+      "step": 16680
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 29.25,
+      "learning_rate": 9.023728813559323e-06,
+      "loss": 1.2022,
+      "step": 16690
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 15.1875,
+      "learning_rate": 9.016949152542374e-06,
+      "loss": 1.4174,
+      "step": 16700
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 27.75,
+      "learning_rate": 9.010169491525424e-06,
+      "loss": 1.2896,
+      "step": 16710
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 25.875,
+      "learning_rate": 9.003389830508475e-06,
+      "loss": 1.3355,
+      "step": 16720
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 22.625,
+      "learning_rate": 8.996610169491527e-06,
+      "loss": 1.3372,
+      "step": 16730
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 19.375,
+      "learning_rate": 8.989830508474578e-06,
+      "loss": 1.3901,
+      "step": 16740
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 12.8125,
+      "learning_rate": 8.983050847457628e-06,
+      "loss": 1.3096,
+      "step": 16750
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 13.9375,
+      "learning_rate": 8.976271186440678e-06,
+      "loss": 1.5191,
+      "step": 16760
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 23.125,
+      "learning_rate": 8.969491525423729e-06,
+      "loss": 1.4531,
+      "step": 16770
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 25.875,
+      "learning_rate": 8.96271186440678e-06,
+      "loss": 1.5546,
+      "step": 16780
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 26.125,
+      "learning_rate": 8.955932203389832e-06,
+      "loss": 1.1527,
+      "step": 16790
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 32.75,
+      "learning_rate": 8.949152542372881e-06,
+      "loss": 1.3397,
+      "step": 16800
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 30.75,
+      "learning_rate": 8.942372881355933e-06,
+      "loss": 1.3663,
+      "step": 16810
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 13.5625,
+      "learning_rate": 8.935593220338984e-06,
+      "loss": 1.3837,
+      "step": 16820
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 12.6875,
+      "learning_rate": 8.928813559322036e-06,
+      "loss": 1.4386,
+      "step": 16830
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 24.75,
+      "learning_rate": 8.922033898305085e-06,
+      "loss": 1.4903,
+      "step": 16840
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 19.0,
+      "learning_rate": 8.915254237288137e-06,
+      "loss": 1.4548,
+      "step": 16850
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 26.25,
+      "learning_rate": 8.908474576271188e-06,
+      "loss": 1.251,
+      "step": 16860
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 20.375,
+      "learning_rate": 8.901694915254238e-06,
+      "loss": 1.4531,
+      "step": 16870
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 20.375,
+      "learning_rate": 8.89491525423729e-06,
+      "loss": 1.2677,
+      "step": 16880
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 22.875,
+      "learning_rate": 8.888135593220339e-06,
+      "loss": 1.3289,
+      "step": 16890
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 26.125,
+      "learning_rate": 8.88135593220339e-06,
+      "loss": 1.5637,
+      "step": 16900
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 47.0,
+      "learning_rate": 8.874576271186442e-06,
+      "loss": 1.3166,
+      "step": 16910
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 19.25,
+      "learning_rate": 8.867796610169492e-06,
+      "loss": 1.2822,
+      "step": 16920
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 34.25,
+      "learning_rate": 8.861016949152543e-06,
+      "loss": 1.3943,
+      "step": 16930
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 10.125,
+      "learning_rate": 8.854237288135594e-06,
+      "loss": 1.3883,
+      "step": 16940
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 33.25,
+      "learning_rate": 8.847457627118646e-06,
+      "loss": 1.3113,
+      "step": 16950
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 27.75,
+      "learning_rate": 8.840677966101695e-06,
+      "loss": 1.3963,
+      "step": 16960
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 21.5,
+      "learning_rate": 8.833898305084747e-06,
+      "loss": 1.5098,
+      "step": 16970
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 20.0,
+      "learning_rate": 8.827118644067797e-06,
+      "loss": 1.4349,
+      "step": 16980
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 22.25,
+      "learning_rate": 8.820338983050848e-06,
+      "loss": 1.4513,
+      "step": 16990
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 9.5,
+      "learning_rate": 8.8135593220339e-06,
+      "loss": 1.1621,
+      "step": 17000
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 13.8125,
+      "learning_rate": 8.806779661016949e-06,
+      "loss": 1.2259,
+      "step": 17010
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 14.1875,
+      "learning_rate": 8.8e-06,
+      "loss": 1.2985,
+      "step": 17020
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 36.75,
+      "learning_rate": 8.793220338983052e-06,
+      "loss": 1.4038,
+      "step": 17030
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 11.875,
+      "learning_rate": 8.786440677966103e-06,
+      "loss": 1.1619,
+      "step": 17040
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 16.25,
+      "learning_rate": 8.779661016949153e-06,
+      "loss": 1.429,
+      "step": 17050
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 22.75,
+      "learning_rate": 8.772881355932204e-06,
+      "loss": 1.06,
+      "step": 17060
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 11.5,
+      "learning_rate": 8.766101694915254e-06,
+      "loss": 1.4852,
+      "step": 17070
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 29.625,
+      "learning_rate": 8.759322033898305e-06,
+      "loss": 1.2247,
+      "step": 17080
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 12.1875,
+      "learning_rate": 8.752542372881357e-06,
+      "loss": 1.3069,
+      "step": 17090
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 21.875,
+      "learning_rate": 8.745762711864407e-06,
+      "loss": 1.3806,
+      "step": 17100
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 11.25,
+      "learning_rate": 8.738983050847458e-06,
+      "loss": 1.2146,
+      "step": 17110
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 30.0,
+      "learning_rate": 8.73220338983051e-06,
+      "loss": 1.2134,
+      "step": 17120
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 16.5,
+      "learning_rate": 8.72542372881356e-06,
+      "loss": 1.479,
+      "step": 17130
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 21.0,
+      "learning_rate": 8.71864406779661e-06,
+      "loss": 1.2287,
+      "step": 17140
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 29.0,
+      "learning_rate": 8.711864406779662e-06,
+      "loss": 1.3519,
+      "step": 17150
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 35.0,
+      "learning_rate": 8.705084745762713e-06,
+      "loss": 1.2435,
+      "step": 17160
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 22.625,
+      "learning_rate": 8.698305084745765e-06,
+      "loss": 1.2609,
+      "step": 17170
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 47.5,
+      "learning_rate": 8.691525423728814e-06,
+      "loss": 1.2522,
+      "step": 17180
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 20.875,
+      "learning_rate": 8.684745762711864e-06,
+      "loss": 1.2606,
+      "step": 17190
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 38.0,
+      "learning_rate": 8.677966101694915e-06,
+      "loss": 1.3564,
+      "step": 17200
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 13.0625,
+      "learning_rate": 8.671186440677967e-06,
+      "loss": 1.4235,
+      "step": 17210
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 24.875,
+      "learning_rate": 8.664406779661018e-06,
+      "loss": 1.1742,
+      "step": 17220
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 52.0,
+      "learning_rate": 8.657627118644068e-06,
+      "loss": 1.3956,
+      "step": 17230
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 29.875,
+      "learning_rate": 8.65084745762712e-06,
+      "loss": 1.3366,
+      "step": 17240
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 19.75,
+      "learning_rate": 8.64406779661017e-06,
+      "loss": 1.4179,
+      "step": 17250
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 33.0,
+      "learning_rate": 8.637288135593222e-06,
+      "loss": 1.4633,
+      "step": 17260
+    },
+    {
+      "epoch": 0.43,
     }
   ],
   "logging_steps": 10,
@@ -10538,7 +14046,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 5000,
-  "total_flos":
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null
|
12119 |
+
"grad_norm": 24.875,
|
12120 |
+
"learning_rate": 8.630508474576272e-06,
|
12121 |
+
"loss": 1.3481,
|
12122 |
+
"step": 17270
|
12123 |
+
},
|
12124 |
+
{
|
12125 |
+
"epoch": 0.43,
|
12126 |
+
"grad_norm": 14.25,
|
12127 |
+
"learning_rate": 8.623728813559322e-06,
|
12128 |
+
"loss": 1.3081,
|
12129 |
+
"step": 17280
|
12130 |
+
},
|
12131 |
+
{
|
12132 |
+
"epoch": 0.43,
|
12133 |
+
"grad_norm": 21.125,
|
12134 |
+
"learning_rate": 8.616949152542373e-06,
|
12135 |
+
"loss": 1.5585,
|
12136 |
+
"step": 17290
|
12137 |
+
},
|
12138 |
+
{
|
12139 |
+
"epoch": 0.43,
|
12140 |
+
"grad_norm": 14.1875,
|
12141 |
+
"learning_rate": 8.610169491525424e-06,
|
12142 |
+
"loss": 1.4156,
|
12143 |
+
"step": 17300
|
12144 |
+
},
|
12145 |
+
{
|
12146 |
+
"epoch": 0.43,
|
12147 |
+
"grad_norm": 20.25,
|
12148 |
+
"learning_rate": 8.603389830508476e-06,
|
12149 |
+
"loss": 1.255,
|
12150 |
+
"step": 17310
|
12151 |
+
},
|
12152 |
+
{
|
12153 |
+
"epoch": 0.43,
|
12154 |
+
"grad_norm": 34.0,
|
12155 |
+
"learning_rate": 8.596610169491526e-06,
|
12156 |
+
"loss": 1.3487,
|
12157 |
+
"step": 17320
|
12158 |
+
},
|
12159 |
+
{
|
12160 |
+
"epoch": 0.43,
|
12161 |
+
"grad_norm": 18.875,
|
12162 |
+
"learning_rate": 8.589830508474577e-06,
|
12163 |
+
"loss": 1.28,
|
12164 |
+
"step": 17330
|
12165 |
+
},
|
12166 |
+
{
|
12167 |
+
"epoch": 0.43,
|
12168 |
+
"grad_norm": 10.0625,
|
12169 |
+
"learning_rate": 8.583050847457628e-06,
|
12170 |
+
"loss": 1.3084,
|
12171 |
+
"step": 17340
|
12172 |
+
},
|
12173 |
+
{
|
12174 |
+
"epoch": 0.43,
|
12175 |
+
"grad_norm": 44.75,
|
12176 |
+
"learning_rate": 8.57627118644068e-06,
|
12177 |
+
"loss": 1.2927,
|
12178 |
+
"step": 17350
|
12179 |
+
},
|
12180 |
+
{
|
12181 |
+
"epoch": 0.43,
|
12182 |
+
"grad_norm": 23.75,
|
12183 |
+
"learning_rate": 8.56949152542373e-06,
|
12184 |
+
"loss": 1.2896,
|
12185 |
+
"step": 17360
|
12186 |
+
},
|
12187 |
+
{
|
12188 |
+
"epoch": 0.43,
|
12189 |
+
"grad_norm": 9.25,
|
12190 |
+
"learning_rate": 8.56271186440678e-06,
|
12191 |
+
"loss": 1.3518,
|
12192 |
+
"step": 17370
|
12193 |
+
},
|
12194 |
+
{
|
12195 |
+
"epoch": 0.43,
|
12196 |
+
"grad_norm": 15.0625,
|
12197 |
+
"learning_rate": 8.55593220338983e-06,
|
12198 |
+
"loss": 1.4414,
|
12199 |
+
"step": 17380
|
12200 |
+
},
|
12201 |
+
{
|
12202 |
+
"epoch": 0.43,
|
12203 |
+
"grad_norm": 23.125,
|
12204 |
+
"learning_rate": 8.549152542372882e-06,
|
12205 |
+
"loss": 1.4226,
|
12206 |
+
"step": 17390
|
12207 |
+
},
|
12208 |
+
{
|
12209 |
+
"epoch": 0.43,
|
12210 |
+
"grad_norm": 25.25,
|
12211 |
+
"learning_rate": 8.542372881355933e-06,
|
12212 |
+
"loss": 1.5472,
|
12213 |
+
"step": 17400
|
12214 |
+
},
|
12215 |
+
{
|
12216 |
+
"epoch": 0.44,
|
12217 |
+
"grad_norm": 25.375,
|
12218 |
+
"learning_rate": 8.535593220338983e-06,
|
12219 |
+
"loss": 1.2825,
|
12220 |
+
"step": 17410
|
12221 |
+
},
|
12222 |
+
{
|
12223 |
+
"epoch": 0.44,
|
12224 |
+
"grad_norm": 13.1875,
|
12225 |
+
"learning_rate": 8.528813559322034e-06,
|
12226 |
+
"loss": 1.3777,
|
12227 |
+
"step": 17420
|
12228 |
+
},
|
12229 |
+
{
|
12230 |
+
"epoch": 0.44,
|
12231 |
+
"grad_norm": 22.75,
|
12232 |
+
"learning_rate": 8.522033898305086e-06,
|
12233 |
+
"loss": 1.3355,
|
12234 |
+
"step": 17430
|
12235 |
+
},
|
12236 |
+
{
|
12237 |
+
"epoch": 0.44,
|
12238 |
+
"grad_norm": 13.4375,
|
12239 |
+
"learning_rate": 8.515254237288136e-06,
|
12240 |
+
"loss": 1.4221,
|
12241 |
+
"step": 17440
|
12242 |
+
},
|
12243 |
+
{
|
12244 |
+
"epoch": 0.44,
|
12245 |
+
"grad_norm": 32.5,
|
12246 |
+
"learning_rate": 8.508474576271187e-06,
|
12247 |
+
"loss": 1.302,
|
12248 |
+
"step": 17450
|
12249 |
+
},
|
12250 |
+
{
|
12251 |
+
"epoch": 0.44,
|
12252 |
+
"grad_norm": 17.125,
|
12253 |
+
"learning_rate": 8.501694915254238e-06,
|
12254 |
+
"loss": 1.1499,
|
12255 |
+
"step": 17460
|
12256 |
+
},
|
12257 |
+
{
|
12258 |
+
"epoch": 0.44,
|
12259 |
+
"grad_norm": 32.0,
|
12260 |
+
"learning_rate": 8.49491525423729e-06,
|
12261 |
+
"loss": 1.1878,
|
12262 |
+
"step": 17470
|
12263 |
+
},
|
12264 |
+
{
|
12265 |
+
"epoch": 0.44,
|
12266 |
+
"grad_norm": 46.25,
|
12267 |
+
"learning_rate": 8.48813559322034e-06,
|
12268 |
+
"loss": 1.5593,
|
12269 |
+
"step": 17480
|
12270 |
+
},
|
12271 |
+
{
|
12272 |
+
"epoch": 0.44,
|
12273 |
+
"grad_norm": 21.0,
|
12274 |
+
"learning_rate": 8.481355932203391e-06,
|
12275 |
+
"loss": 1.3942,
|
12276 |
+
"step": 17490
|
12277 |
+
},
|
12278 |
+
{
|
12279 |
+
"epoch": 0.44,
|
12280 |
+
"grad_norm": 11.6875,
|
12281 |
+
"learning_rate": 8.47457627118644e-06,
|
12282 |
+
"loss": 1.4905,
|
12283 |
+
"step": 17500
|
12284 |
+
},
|
12285 |
+
{
|
12286 |
+
"epoch": 0.44,
|
12287 |
+
"grad_norm": 25.875,
|
12288 |
+
"learning_rate": 8.467796610169492e-06,
|
12289 |
+
"loss": 1.327,
|
12290 |
+
"step": 17510
|
12291 |
+
},
|
12292 |
+
{
|
12293 |
+
"epoch": 0.44,
|
12294 |
+
"grad_norm": 10.125,
|
12295 |
+
"learning_rate": 8.461016949152543e-06,
|
12296 |
+
"loss": 1.4815,
|
12297 |
+
"step": 17520
|
12298 |
+
},
|
12299 |
+
{
|
12300 |
+
"epoch": 0.44,
|
12301 |
+
"grad_norm": 17.25,
|
12302 |
+
"learning_rate": 8.454237288135593e-06,
|
12303 |
+
"loss": 1.3061,
|
12304 |
+
"step": 17530
|
12305 |
+
},
|
12306 |
+
{
|
12307 |
+
"epoch": 0.44,
|
12308 |
+
"grad_norm": 13.25,
|
12309 |
+
"learning_rate": 8.447457627118644e-06,
|
12310 |
+
"loss": 1.2559,
|
12311 |
+
"step": 17540
|
12312 |
+
},
|
12313 |
+
{
|
12314 |
+
"epoch": 0.44,
|
12315 |
+
"grad_norm": 8.125,
|
12316 |
+
"learning_rate": 8.440677966101696e-06,
|
12317 |
+
"loss": 1.3552,
|
12318 |
+
"step": 17550
|
12319 |
+
},
|
12320 |
+
{
|
12321 |
+
"epoch": 0.44,
|
12322 |
+
"grad_norm": 8.625,
|
12323 |
+
"learning_rate": 8.433898305084747e-06,
|
12324 |
+
"loss": 1.3366,
|
12325 |
+
"step": 17560
|
12326 |
+
},
|
12327 |
+
{
|
12328 |
+
"epoch": 0.44,
|
12329 |
+
"grad_norm": 25.5,
|
12330 |
+
"learning_rate": 8.427118644067797e-06,
|
12331 |
+
"loss": 1.3192,
|
12332 |
+
"step": 17570
|
12333 |
+
},
|
12334 |
+
{
|
12335 |
+
"epoch": 0.44,
|
12336 |
+
"grad_norm": 35.75,
|
12337 |
+
"learning_rate": 8.420338983050848e-06,
|
12338 |
+
"loss": 1.3305,
|
12339 |
+
"step": 17580
|
12340 |
+
},
|
12341 |
+
{
|
12342 |
+
"epoch": 0.44,
|
12343 |
+
"grad_norm": 22.0,
|
12344 |
+
"learning_rate": 8.413559322033898e-06,
|
12345 |
+
"loss": 1.2914,
|
12346 |
+
"step": 17590
|
12347 |
+
},
|
12348 |
+
{
|
12349 |
+
"epoch": 0.44,
|
12350 |
+
"grad_norm": 51.75,
|
12351 |
+
"learning_rate": 8.40677966101695e-06,
|
12352 |
+
"loss": 1.3449,
|
12353 |
+
"step": 17600
|
12354 |
+
},
|
12355 |
+
{
|
12356 |
+
"epoch": 0.44,
|
12357 |
+
"grad_norm": 55.0,
|
12358 |
+
"learning_rate": 8.400000000000001e-06,
|
12359 |
+
"loss": 1.2464,
|
12360 |
+
"step": 17610
|
12361 |
+
},
|
12362 |
+
{
|
12363 |
+
"epoch": 0.44,
|
12364 |
+
"grad_norm": 20.625,
|
12365 |
+
"learning_rate": 8.39322033898305e-06,
|
12366 |
+
"loss": 1.4602,
|
12367 |
+
"step": 17620
|
12368 |
+
},
|
12369 |
+
{
|
12370 |
+
"epoch": 0.44,
|
12371 |
+
"grad_norm": 41.25,
|
12372 |
+
"learning_rate": 8.386440677966102e-06,
|
12373 |
+
"loss": 1.3996,
|
12374 |
+
"step": 17630
|
12375 |
+
},
|
12376 |
+
{
|
12377 |
+
"epoch": 0.44,
|
12378 |
+
"grad_norm": 10.875,
|
12379 |
+
"learning_rate": 8.379661016949153e-06,
|
12380 |
+
"loss": 1.3882,
|
12381 |
+
"step": 17640
|
12382 |
+
},
|
12383 |
+
{
|
12384 |
+
"epoch": 0.44,
|
12385 |
+
"grad_norm": 28.25,
|
12386 |
+
"learning_rate": 8.372881355932205e-06,
|
12387 |
+
"loss": 1.4983,
|
12388 |
+
"step": 17650
|
12389 |
+
},
|
12390 |
+
{
|
12391 |
+
"epoch": 0.44,
|
12392 |
+
"grad_norm": 28.5,
|
12393 |
+
"learning_rate": 8.366101694915255e-06,
|
12394 |
+
"loss": 1.4799,
|
12395 |
+
"step": 17660
|
12396 |
+
},
|
12397 |
+
{
|
12398 |
+
"epoch": 0.44,
|
12399 |
+
"grad_norm": 34.25,
|
12400 |
+
"learning_rate": 8.359322033898306e-06,
|
12401 |
+
"loss": 1.4551,
|
12402 |
+
"step": 17670
|
12403 |
+
},
|
12404 |
+
{
|
12405 |
+
"epoch": 0.44,
|
12406 |
+
"grad_norm": 9.0625,
|
12407 |
+
"learning_rate": 8.352542372881357e-06,
|
12408 |
+
"loss": 1.3426,
|
12409 |
+
"step": 17680
|
12410 |
+
},
|
12411 |
+
{
|
12412 |
+
"epoch": 0.44,
|
12413 |
+
"grad_norm": 24.0,
|
12414 |
+
"learning_rate": 8.345762711864409e-06,
|
12415 |
+
"loss": 1.0709,
|
12416 |
+
"step": 17690
|
12417 |
+
},
|
12418 |
+
{
|
12419 |
+
"epoch": 0.44,
|
12420 |
+
"grad_norm": 21.75,
|
12421 |
+
"learning_rate": 8.338983050847458e-06,
|
12422 |
+
"loss": 1.2716,
|
12423 |
+
"step": 17700
|
12424 |
+
},
|
12425 |
+
{
|
12426 |
+
"epoch": 0.44,
|
12427 |
+
"grad_norm": 22.125,
|
12428 |
+
"learning_rate": 8.332203389830508e-06,
|
12429 |
+
"loss": 1.258,
|
12430 |
+
"step": 17710
|
12431 |
+
},
|
12432 |
+
{
|
12433 |
+
"epoch": 0.44,
|
12434 |
+
"grad_norm": 22.375,
|
12435 |
+
"learning_rate": 8.32542372881356e-06,
|
12436 |
+
"loss": 1.4624,
|
12437 |
+
"step": 17720
|
12438 |
+
},
|
12439 |
+
{
|
12440 |
+
"epoch": 0.44,
|
12441 |
+
"grad_norm": 20.25,
|
12442 |
+
"learning_rate": 8.318644067796611e-06,
|
12443 |
+
"loss": 1.1973,
|
12444 |
+
"step": 17730
|
12445 |
+
},
|
12446 |
+
{
|
12447 |
+
"epoch": 0.44,
|
12448 |
+
"grad_norm": 23.625,
|
12449 |
+
"learning_rate": 8.311864406779662e-06,
|
12450 |
+
"loss": 1.3575,
|
12451 |
+
"step": 17740
|
12452 |
+
},
|
12453 |
+
{
|
12454 |
+
"epoch": 0.44,
|
12455 |
+
"grad_norm": 21.75,
|
12456 |
+
"learning_rate": 8.305084745762712e-06,
|
12457 |
+
"loss": 1.4038,
|
12458 |
+
"step": 17750
|
12459 |
+
},
|
12460 |
+
{
|
12461 |
+
"epoch": 0.44,
|
12462 |
+
"grad_norm": 13.875,
|
12463 |
+
"learning_rate": 8.298305084745763e-06,
|
12464 |
+
"loss": 1.2004,
|
12465 |
+
"step": 17760
|
12466 |
+
},
|
12467 |
+
{
|
12468 |
+
"epoch": 0.44,
|
12469 |
+
"grad_norm": 13.125,
|
12470 |
+
"learning_rate": 8.291525423728815e-06,
|
12471 |
+
"loss": 1.4105,
|
12472 |
+
"step": 17770
|
12473 |
+
},
|
12474 |
+
{
|
12475 |
+
"epoch": 0.44,
|
12476 |
+
"grad_norm": 23.75,
|
12477 |
+
"learning_rate": 8.284745762711866e-06,
|
12478 |
+
"loss": 1.4087,
|
12479 |
+
"step": 17780
|
12480 |
+
},
|
12481 |
+
{
|
12482 |
+
"epoch": 0.44,
|
12483 |
+
"grad_norm": 17.5,
|
12484 |
+
"learning_rate": 8.277966101694916e-06,
|
12485 |
+
"loss": 1.3011,
|
12486 |
+
"step": 17790
|
12487 |
+
},
|
12488 |
+
{
|
12489 |
+
"epoch": 0.45,
|
12490 |
+
"grad_norm": 22.375,
|
12491 |
+
"learning_rate": 8.271186440677966e-06,
|
12492 |
+
"loss": 1.3587,
|
12493 |
+
"step": 17800
|
12494 |
+
},
|
12495 |
+
{
|
12496 |
+
"epoch": 0.45,
|
12497 |
+
"grad_norm": 21.25,
|
12498 |
+
"learning_rate": 8.264406779661017e-06,
|
12499 |
+
"loss": 1.3222,
|
12500 |
+
"step": 17810
|
12501 |
+
},
|
12502 |
+
{
|
12503 |
+
"epoch": 0.45,
|
12504 |
+
"grad_norm": 17.625,
|
12505 |
+
"learning_rate": 8.257627118644068e-06,
|
12506 |
+
"loss": 1.314,
|
12507 |
+
"step": 17820
|
12508 |
+
},
|
12509 |
+
{
|
12510 |
+
"epoch": 0.45,
|
12511 |
+
"grad_norm": 18.875,
|
12512 |
+
"learning_rate": 8.25084745762712e-06,
|
12513 |
+
"loss": 1.2684,
|
12514 |
+
"step": 17830
|
12515 |
+
},
|
12516 |
+
{
|
12517 |
+
"epoch": 0.45,
|
12518 |
+
"grad_norm": 14.125,
|
12519 |
+
"learning_rate": 8.24406779661017e-06,
|
12520 |
+
"loss": 1.3755,
|
12521 |
+
"step": 17840
|
12522 |
+
},
|
12523 |
+
{
|
12524 |
+
"epoch": 0.45,
|
12525 |
+
"grad_norm": 36.0,
|
12526 |
+
"learning_rate": 8.237288135593221e-06,
|
12527 |
+
"loss": 1.4589,
|
12528 |
+
"step": 17850
|
12529 |
+
},
|
12530 |
+
{
|
12531 |
+
"epoch": 0.45,
|
12532 |
+
"grad_norm": 49.0,
|
12533 |
+
"learning_rate": 8.230508474576272e-06,
|
12534 |
+
"loss": 1.3277,
|
12535 |
+
"step": 17860
|
12536 |
+
},
|
12537 |
+
{
|
12538 |
+
"epoch": 0.45,
|
12539 |
+
"grad_norm": 15.5,
|
12540 |
+
"learning_rate": 8.223728813559324e-06,
|
12541 |
+
"loss": 1.1487,
|
12542 |
+
"step": 17870
|
12543 |
+
},
|
12544 |
+
{
|
12545 |
+
"epoch": 0.45,
|
12546 |
+
"grad_norm": 55.0,
|
12547 |
+
"learning_rate": 8.216949152542373e-06,
|
12548 |
+
"loss": 1.3018,
|
12549 |
+
"step": 17880
|
12550 |
+
},
|
12551 |
+
{
|
12552 |
+
"epoch": 0.45,
|
12553 |
+
"grad_norm": 12.75,
|
12554 |
+
"learning_rate": 8.210169491525425e-06,
|
12555 |
+
"loss": 1.5137,
|
12556 |
+
"step": 17890
|
12557 |
+
},
|
12558 |
+
{
|
12559 |
+
"epoch": 0.45,
|
12560 |
+
"grad_norm": 14.4375,
|
12561 |
+
"learning_rate": 8.203389830508475e-06,
|
12562 |
+
"loss": 1.412,
|
12563 |
+
"step": 17900
|
12564 |
+
},
|
12565 |
+
{
|
12566 |
+
"epoch": 0.45,
|
12567 |
+
"grad_norm": 14.8125,
|
12568 |
+
"learning_rate": 8.196610169491526e-06,
|
12569 |
+
"loss": 1.3836,
|
12570 |
+
"step": 17910
|
12571 |
+
},
|
12572 |
+
{
|
12573 |
+
"epoch": 0.45,
|
12574 |
+
"grad_norm": 30.0,
|
12575 |
+
"learning_rate": 8.189830508474577e-06,
|
12576 |
+
"loss": 1.3612,
|
12577 |
+
"step": 17920
|
12578 |
+
},
|
12579 |
+
{
|
12580 |
+
"epoch": 0.45,
|
12581 |
+
"grad_norm": 56.0,
|
12582 |
+
"learning_rate": 8.183050847457627e-06,
|
12583 |
+
"loss": 1.3871,
|
12584 |
+
"step": 17930
|
12585 |
+
},
|
12586 |
+
{
|
12587 |
+
"epoch": 0.45,
|
12588 |
+
"grad_norm": 12.6875,
|
12589 |
+
"learning_rate": 8.176271186440678e-06,
|
12590 |
+
"loss": 1.335,
|
12591 |
+
"step": 17940
|
12592 |
+
},
|
12593 |
+
{
|
12594 |
+
"epoch": 0.45,
|
12595 |
+
"grad_norm": 38.75,
|
12596 |
+
"learning_rate": 8.16949152542373e-06,
|
12597 |
+
"loss": 1.3483,
|
12598 |
+
"step": 17950
|
12599 |
+
},
|
12600 |
+
{
|
12601 |
+
"epoch": 0.45,
|
12602 |
+
"grad_norm": 17.5,
|
12603 |
+
"learning_rate": 8.162711864406781e-06,
|
12604 |
+
"loss": 1.3832,
|
12605 |
+
"step": 17960
|
12606 |
+
},
|
12607 |
+
{
|
12608 |
+
"epoch": 0.45,
|
12609 |
+
"grad_norm": 78.5,
|
12610 |
+
"learning_rate": 8.155932203389831e-06,
|
12611 |
+
"loss": 1.6168,
|
12612 |
+
"step": 17970
|
12613 |
+
},
|
12614 |
+
{
|
12615 |
+
"epoch": 0.45,
|
12616 |
+
"grad_norm": 18.5,
|
12617 |
+
"learning_rate": 8.149152542372882e-06,
|
12618 |
+
"loss": 1.2728,
|
12619 |
+
"step": 17980
|
12620 |
+
},
|
12621 |
+
{
|
12622 |
+
"epoch": 0.45,
|
12623 |
+
"grad_norm": 19.5,
|
12624 |
+
"learning_rate": 8.142372881355934e-06,
|
12625 |
+
"loss": 1.3424,
|
12626 |
+
"step": 17990
|
12627 |
+
},
|
12628 |
+
{
|
12629 |
+
"epoch": 0.45,
|
12630 |
+
"grad_norm": 22.5,
|
12631 |
+
"learning_rate": 8.135593220338983e-06,
|
12632 |
+
"loss": 1.2738,
|
12633 |
+
"step": 18000
|
12634 |
+
},
|
12635 |
+
{
|
12636 |
+
"epoch": 0.45,
|
12637 |
+
"grad_norm": 9.9375,
|
12638 |
+
"learning_rate": 8.128813559322035e-06,
|
12639 |
+
"loss": 1.2798,
|
12640 |
+
"step": 18010
|
12641 |
+
},
|
12642 |
+
{
|
12643 |
+
"epoch": 0.45,
|
12644 |
+
"grad_norm": 19.625,
|
12645 |
+
"learning_rate": 8.122033898305085e-06,
|
12646 |
+
"loss": 1.4365,
|
12647 |
+
"step": 18020
|
12648 |
+
},
|
12649 |
+
{
|
12650 |
+
"epoch": 0.45,
|
12651 |
+
"grad_norm": 12.9375,
|
12652 |
+
"learning_rate": 8.115254237288136e-06,
|
12653 |
+
"loss": 1.5073,
|
12654 |
+
"step": 18030
|
12655 |
+
},
|
12656 |
+
{
|
12657 |
+
"epoch": 0.45,
|
12658 |
+
"grad_norm": 23.0,
|
12659 |
+
"learning_rate": 8.108474576271187e-06,
|
12660 |
+
"loss": 1.3122,
|
12661 |
+
"step": 18040
|
12662 |
+
},
|
12663 |
+
{
|
12664 |
+
"epoch": 0.45,
|
12665 |
+
"grad_norm": 22.75,
|
12666 |
+
"learning_rate": 8.101694915254237e-06,
|
12667 |
+
"loss": 1.4867,
|
12668 |
+
"step": 18050
|
12669 |
+
},
|
12670 |
+
{
|
12671 |
+
"epoch": 0.45,
|
12672 |
+
"grad_norm": 33.5,
|
12673 |
+
"learning_rate": 8.094915254237289e-06,
|
12674 |
+
"loss": 1.3197,
|
12675 |
+
"step": 18060
|
12676 |
+
},
|
12677 |
+
{
|
12678 |
+
"epoch": 0.45,
|
12679 |
+
"grad_norm": 25.25,
|
12680 |
+
"learning_rate": 8.08813559322034e-06,
|
12681 |
+
"loss": 1.1822,
|
12682 |
+
"step": 18070
|
12683 |
+
},
|
12684 |
+
{
|
12685 |
+
"epoch": 0.45,
|
12686 |
+
"grad_norm": 28.25,
|
12687 |
+
"learning_rate": 8.081355932203391e-06,
|
12688 |
+
"loss": 1.3108,
|
12689 |
+
"step": 18080
|
12690 |
+
},
|
12691 |
+
{
|
12692 |
+
"epoch": 0.45,
|
12693 |
+
"grad_norm": 49.75,
|
12694 |
+
"learning_rate": 8.074576271186441e-06,
|
12695 |
+
"loss": 1.4321,
|
12696 |
+
"step": 18090
|
12697 |
+
},
|
12698 |
+
{
|
12699 |
+
"epoch": 0.45,
|
12700 |
+
"grad_norm": 30.625,
|
12701 |
+
"learning_rate": 8.067796610169492e-06,
|
12702 |
+
"loss": 1.3081,
|
12703 |
+
"step": 18100
|
12704 |
+
},
|
12705 |
+
{
|
12706 |
+
"epoch": 0.45,
|
12707 |
+
"grad_norm": 10.375,
|
12708 |
+
"learning_rate": 8.061016949152542e-06,
|
12709 |
+
"loss": 1.2814,
|
12710 |
+
"step": 18110
|
12711 |
+
},
|
12712 |
+
{
|
12713 |
+
"epoch": 0.45,
|
12714 |
+
"grad_norm": 26.0,
|
12715 |
+
"learning_rate": 8.054237288135594e-06,
|
12716 |
+
"loss": 1.2951,
|
12717 |
+
"step": 18120
|
12718 |
+
},
|
12719 |
+
{
|
12720 |
+
"epoch": 0.45,
|
12721 |
+
"grad_norm": 59.25,
|
12722 |
+
"learning_rate": 8.047457627118645e-06,
|
12723 |
+
"loss": 1.3096,
|
12724 |
+
"step": 18130
|
12725 |
+
},
|
12726 |
+
{
|
12727 |
+
"epoch": 0.45,
|
12728 |
+
"grad_norm": 29.625,
|
12729 |
+
"learning_rate": 8.040677966101695e-06,
|
12730 |
+
"loss": 1.3775,
|
12731 |
+
"step": 18140
|
12732 |
+
},
|
12733 |
+
{
|
12734 |
+
"epoch": 0.45,
|
12735 |
+
"grad_norm": 36.0,
|
12736 |
+
"learning_rate": 8.033898305084746e-06,
|
12737 |
+
"loss": 1.2384,
|
12738 |
+
"step": 18150
|
12739 |
+
},
|
12740 |
+
{
|
12741 |
+
"epoch": 0.45,
|
12742 |
+
"grad_norm": 23.875,
|
12743 |
+
"learning_rate": 8.027118644067797e-06,
|
12744 |
+
"loss": 1.4262,
|
12745 |
+
"step": 18160
|
12746 |
+
},
|
12747 |
+
{
|
12748 |
+
"epoch": 0.45,
|
12749 |
+
"grad_norm": 26.125,
|
12750 |
+
"learning_rate": 8.020338983050849e-06,
|
12751 |
+
"loss": 1.1021,
|
12752 |
+
"step": 18170
|
12753 |
+
},
|
12754 |
+
{
|
12755 |
+
"epoch": 0.45,
|
12756 |
+
"grad_norm": 22.75,
|
12757 |
+
"learning_rate": 8.013559322033899e-06,
|
12758 |
+
"loss": 1.3957,
|
12759 |
+
"step": 18180
|
12760 |
+
},
|
12761 |
+
{
|
12762 |
+
"epoch": 0.45,
|
12763 |
+
"grad_norm": 27.875,
|
12764 |
+
"learning_rate": 8.00677966101695e-06,
|
12765 |
+
"loss": 1.4021,
|
12766 |
+
"step": 18190
|
12767 |
+
},
|
12768 |
+
{
|
12769 |
+
"epoch": 0.46,
|
12770 |
+
"grad_norm": 22.25,
|
12771 |
+
"learning_rate": 8.000000000000001e-06,
|
12772 |
+
"loss": 1.4371,
|
12773 |
+
"step": 18200
|
12774 |
+
},
|
12775 |
+
{
|
12776 |
+
"epoch": 0.46,
|
12777 |
+
"grad_norm": 30.5,
|
12778 |
+
"learning_rate": 7.993220338983053e-06,
|
12779 |
+
"loss": 1.3273,
|
12780 |
+
"step": 18210
|
12781 |
+
},
|
12782 |
+
{
|
12783 |
+
"epoch": 0.46,
|
12784 |
+
"grad_norm": 24.375,
|
12785 |
+
"learning_rate": 7.986440677966102e-06,
|
12786 |
+
"loss": 1.3039,
|
12787 |
+
"step": 18220
|
12788 |
+
},
|
12789 |
+
{
|
12790 |
+
"epoch": 0.46,
|
12791 |
+
"grad_norm": 18.125,
|
12792 |
+
"learning_rate": 7.979661016949152e-06,
|
12793 |
+
"loss": 1.2717,
|
12794 |
+
"step": 18230
|
12795 |
+
},
|
12796 |
+
{
|
12797 |
+
"epoch": 0.46,
|
12798 |
+
"grad_norm": 62.75,
|
12799 |
+
"learning_rate": 7.972881355932204e-06,
|
12800 |
+
"loss": 1.4438,
|
12801 |
+
"step": 18240
|
12802 |
+
},
|
12803 |
+
{
|
12804 |
+
"epoch": 0.46,
|
12805 |
+
"grad_norm": 47.0,
|
12806 |
+
"learning_rate": 7.966101694915255e-06,
|
12807 |
+
"loss": 1.5599,
|
12808 |
+
"step": 18250
|
12809 |
+
},
|
12810 |
+
{
|
12811 |
+
"epoch": 0.46,
|
12812 |
+
"grad_norm": 11.0,
|
12813 |
+
"learning_rate": 7.959322033898306e-06,
|
12814 |
+
"loss": 1.3485,
|
12815 |
+
"step": 18260
|
12816 |
+
},
|
12817 |
+
{
|
12818 |
+
"epoch": 0.46,
|
12819 |
+
"grad_norm": 16.875,
|
12820 |
+
"learning_rate": 7.952542372881356e-06,
|
12821 |
+
"loss": 1.428,
|
12822 |
+
"step": 18270
|
12823 |
+
},
|
12824 |
+
{
|
12825 |
+
"epoch": 0.46,
|
12826 |
+
"grad_norm": 35.5,
|
12827 |
+
"learning_rate": 7.945762711864407e-06,
|
12828 |
+
"loss": 1.3857,
|
12829 |
+
"step": 18280
|
12830 |
+
},
|
12831 |
+
{
|
12832 |
+
"epoch": 0.46,
|
12833 |
+
"grad_norm": 30.25,
|
12834 |
+
"learning_rate": 7.938983050847459e-06,
|
12835 |
+
"loss": 1.1532,
|
12836 |
+
"step": 18290
|
12837 |
+
},
|
12838 |
+
{
|
12839 |
+
"epoch": 0.46,
|
12840 |
+
"grad_norm": 22.375,
|
12841 |
+
"learning_rate": 7.93220338983051e-06,
|
12842 |
+
"loss": 1.2357,
|
12843 |
+
"step": 18300
|
12844 |
+
},
|
12845 |
+
{
|
12846 |
+
"epoch": 0.46,
|
12847 |
+
"grad_norm": 29.25,
|
12848 |
+
"learning_rate": 7.92542372881356e-06,
|
12849 |
+
"loss": 1.337,
|
12850 |
+
"step": 18310
|
12851 |
+
},
|
12852 |
+
{
|
12853 |
+
"epoch": 0.46,
|
12854 |
+
"grad_norm": 13.3125,
|
12855 |
+
"learning_rate": 7.91864406779661e-06,
|
12856 |
+
"loss": 1.3956,
|
12857 |
+
"step": 18320
|
12858 |
+
},
|
12859 |
+
{
|
12860 |
+
"epoch": 0.46,
|
12861 |
+
"grad_norm": 36.75,
|
12862 |
+
"learning_rate": 7.911864406779661e-06,
|
12863 |
+
"loss": 1.2387,
|
12864 |
+
"step": 18330
|
12865 |
+
},
|
12866 |
+
{
|
12867 |
+
"epoch": 0.46,
|
12868 |
+
"grad_norm": 51.75,
|
12869 |
+
"learning_rate": 7.905084745762712e-06,
|
12870 |
+
"loss": 1.305,
|
12871 |
+
"step": 18340
|
12872 |
+
},
|
12873 |
+
{
|
12874 |
+
"epoch": 0.46,
|
12875 |
+
"grad_norm": 28.625,
|
12876 |
+
"learning_rate": 7.898305084745764e-06,
|
12877 |
+
"loss": 1.2429,
|
12878 |
+
"step": 18350
|
12879 |
+
},
|
12880 |
+
{
|
12881 |
+
"epoch": 0.46,
|
12882 |
+
"grad_norm": 24.375,
|
12883 |
+
"learning_rate": 7.891525423728814e-06,
|
12884 |
+
"loss": 1.3913,
|
12885 |
+
"step": 18360
|
12886 |
+
},
|
12887 |
+
{
|
12888 |
+
"epoch": 0.46,
|
12889 |
+
"grad_norm": 20.125,
|
12890 |
+
"learning_rate": 7.884745762711865e-06,
|
12891 |
+
"loss": 1.5295,
|
12892 |
+
"step": 18370
|
12893 |
+
},
|
12894 |
+
{
|
12895 |
+
"epoch": 0.46,
|
12896 |
+
"grad_norm": 25.625,
|
12897 |
+
"learning_rate": 7.877966101694916e-06,
|
12898 |
+
"loss": 1.3455,
|
12899 |
+
"step": 18380
|
12900 |
+
},
|
12901 |
+
{
|
12902 |
+
"epoch": 0.46,
|
12903 |
+
"grad_norm": 17.375,
|
12904 |
+
"learning_rate": 7.871186440677968e-06,
|
12905 |
+
"loss": 1.5508,
|
12906 |
+
"step": 18390
|
12907 |
+
},
|
12908 |
+
{
|
12909 |
+
"epoch": 0.46,
|
12910 |
+
"grad_norm": 71.0,
|
12911 |
+
"learning_rate": 7.864406779661017e-06,
|
12912 |
+
"loss": 1.2648,
|
12913 |
+
"step": 18400
|
12914 |
+
},
|
12915 |
+
{
|
12916 |
+
"epoch": 0.46,
|
12917 |
+
"grad_norm": 28.875,
|
12918 |
+
"learning_rate": 7.857627118644069e-06,
|
12919 |
+
"loss": 1.2369,
|
12920 |
+
"step": 18410
|
12921 |
+
},
|
12922 |
+
{
|
12923 |
+
"epoch": 0.46,
|
12924 |
+
"grad_norm": 18.25,
|
12925 |
+
"learning_rate": 7.850847457627119e-06,
|
12926 |
+
"loss": 1.5172,
|
12927 |
+
"step": 18420
|
12928 |
+
},
|
12929 |
+
{
|
12930 |
+
"epoch": 0.46,
|
12931 |
+
"grad_norm": 20.625,
|
12932 |
+
"learning_rate": 7.84406779661017e-06,
|
12933 |
+
"loss": 1.3711,
|
12934 |
+
"step": 18430
|
12935 |
+
},
|
12936 |
+
{
|
12937 |
+
"epoch": 0.46,
|
12938 |
+
"grad_norm": 12.6875,
|
12939 |
+
"learning_rate": 7.837288135593221e-06,
|
12940 |
+
"loss": 1.4377,
|
12941 |
+
"step": 18440
|
12942 |
+
},
|
12943 |
+
{
|
12944 |
+
"epoch": 0.46,
|
12945 |
+
"grad_norm": 28.5,
|
12946 |
+
"learning_rate": 7.830508474576271e-06,
|
12947 |
+
"loss": 1.303,
|
12948 |
+
"step": 18450
|
12949 |
+
},
|
12950 |
+
{
|
12951 |
+
"epoch": 0.46,
|
12952 |
+
"grad_norm": 18.125,
|
12953 |
+
"learning_rate": 7.823728813559322e-06,
|
12954 |
+
"loss": 1.2747,
|
12955 |
+
"step": 18460
|
12956 |
+
},
|
12957 |
+
{
|
12958 |
+
"epoch": 0.46,
|
12959 |
+
"grad_norm": 29.75,
|
12960 |
+
"learning_rate": 7.816949152542374e-06,
|
12961 |
+
"loss": 1.3278,
|
12962 |
+
"step": 18470
|
12963 |
+
},
|
12964 |
+
{
|
12965 |
+
"epoch": 0.46,
|
12966 |
+
"grad_norm": 17.25,
|
12967 |
+
"learning_rate": 7.810169491525425e-06,
|
12968 |
+
"loss": 1.558,
|
12969 |
+
"step": 18480
|
12970 |
+
},
|
12971 |
+
{
|
12972 |
+
"epoch": 0.46,
|
12973 |
+
"grad_norm": 10.625,
|
12974 |
+
"learning_rate": 7.803389830508475e-06,
|
12975 |
+
"loss": 1.4647,
|
12976 |
+
"step": 18490
|
12977 |
+
},
|
12978 |
+
{
|
12979 |
+
"epoch": 0.46,
|
12980 |
+
"grad_norm": 21.25,
|
12981 |
+
"learning_rate": 7.796610169491526e-06,
|
12982 |
+
"loss": 1.3819,
|
12983 |
+
"step": 18500
|
12984 |
+
},
|
12985 |
+
{
|
12986 |
+
"epoch": 0.46,
|
12987 |
+
"grad_norm": 13.875,
|
12988 |
+
"learning_rate": 7.789830508474578e-06,
|
12989 |
+
"loss": 1.3907,
|
12990 |
+
"step": 18510
|
12991 |
+
},
|
12992 |
+
{
|
12993 |
+
"epoch": 0.46,
|
12994 |
+
"grad_norm": 20.375,
|
12995 |
+
"learning_rate": 7.783050847457628e-06,
|
12996 |
+
"loss": 1.3471,
|
12997 |
+
"step": 18520
|
12998 |
+
},
|
12999 |
+
{
|
13000 |
+
"epoch": 0.46,
|
13001 |
+
"grad_norm": 18.5,
|
13002 |
+
"learning_rate": 7.776271186440679e-06,
|
13003 |
+
"loss": 1.5287,
|
13004 |
+
"step": 18530
|
13005 |
+
},
|
13006 |
+
{
|
13007 |
+
"epoch": 0.46,
|
13008 |
+
"grad_norm": 21.25,
|
13009 |
+
"learning_rate": 7.769491525423729e-06,
|
13010 |
+
"loss": 1.4263,
|
13011 |
+
"step": 18540
|
13012 |
+
},
|
13013 |
+
{
|
13014 |
+
"epoch": 0.46,
|
13015 |
+
"grad_norm": 10.125,
|
13016 |
+
"learning_rate": 7.76271186440678e-06,
|
13017 |
+
"loss": 1.1069,
|
13018 |
+
"step": 18550
|
13019 |
+
},
|
13020 |
+
{
|
13021 |
+
"epoch": 0.46,
|
13022 |
+
"grad_norm": 13.1875,
|
13023 |
+
"learning_rate": 7.755932203389831e-06,
|
13024 |
+
"loss": 1.2873,
|
13025 |
+
"step": 18560
|
13026 |
+
},
|
13027 |
+
{
|
13028 |
+
"epoch": 0.46,
|
13029 |
+
"grad_norm": 45.5,
|
13030 |
+
"learning_rate": 7.749152542372881e-06,
|
13031 |
+
"loss": 1.3418,
|
13032 |
+
"step": 18570
|
13033 |
+
},
|
13034 |
+
{
|
13035 |
+
"epoch": 0.46,
|
13036 |
+
"grad_norm": 15.625,
|
13037 |
+
"learning_rate": 7.742372881355933e-06,
|
13038 |
+
"loss": 1.2194,
|
13039 |
+
"step": 18580
|
13040 |
+
},
|
13041 |
+
{
|
13042 |
+
"epoch": 0.46,
|
13043 |
+
"grad_norm": 18.25,
|
13044 |
+
"learning_rate": 7.735593220338984e-06,
|
13045 |
+
"loss": 1.3678,
|
13046 |
+
"step": 18590
|
13047 |
+
},
|
13048 |
+
{
|
13049 |
+
"epoch": 0.47,
|
13050 |
+
"grad_norm": 22.375,
|
13051 |
+
"learning_rate": 7.728813559322035e-06,
|
13052 |
+
"loss": 1.3842,
|
13053 |
+
"step": 18600
|
13054 |
+
},
|
13055 |
+
{
|
13056 |
+
"epoch": 0.47,
|
13057 |
+
"grad_norm": 35.75,
|
13058 |
+
"learning_rate": 7.722033898305085e-06,
|
13059 |
+
"loss": 1.1206,
|
13060 |
+
"step": 18610
|
13061 |
+
},
|
13062 |
+
{
|
13063 |
+
"epoch": 0.47,
|
13064 |
+
"grad_norm": 30.25,
|
13065 |
+
"learning_rate": 7.715254237288136e-06,
|
13066 |
+
"loss": 1.3278,
|
13067 |
+
"step": 18620
|
13068 |
+
},
|
13069 |
+
{
|
13070 |
+
"epoch": 0.47,
|
13071 |
+
"grad_norm": 11.75,
|
13072 |
+
"learning_rate": 7.708474576271186e-06,
|
13073 |
+
"loss": 1.3577,
|
13074 |
+
"step": 18630
|
13075 |
+
},
|
13076 |
+
{
|
13077 |
+
"epoch": 0.47,
|
13078 |
+
"grad_norm": 22.875,
|
13079 |
+
"learning_rate": 7.701694915254238e-06,
|
13080 |
+
"loss": 1.4379,
|
13081 |
+
"step": 18640
|
13082 |
+
},
|
13083 |
+
{
|
13084 |
+
"epoch": 0.47,
|
13085 |
+
"grad_norm": 15.75,
|
13086 |
+
"learning_rate": 7.694915254237289e-06,
|
13087 |
+
"loss": 1.4008,
|
13088 |
+
"step": 18650
|
13089 |
+
},
|
13090 |
+
{
|
13091 |
+
"epoch": 0.47,
|
13092 |
+
"grad_norm": 30.25,
|
13093 |
+
"learning_rate": 7.688135593220339e-06,
|
13094 |
+
"loss": 1.3135,
|
13095 |
+
"step": 18660
|
13096 |
+
},
|
13097 |
+
{
|
13098 |
+
"epoch": 0.47,
|
13099 |
+
"grad_norm": 18.0,
|
13100 |
+
"learning_rate": 7.68135593220339e-06,
|
13101 |
+
"loss": 1.3952,
|
13102 |
+
"step": 18670
|
13103 |
+
},
|
13104 |
+
{
|
13105 |
+
"epoch": 0.47,
|
13106 |
+
"grad_norm": 16.875,
|
13107 |
+
"learning_rate": 7.674576271186441e-06,
|
13108 |
+
"loss": 1.3399,
|
13109 |
+
"step": 18680
|
13110 |
+
},
|
13111 |
+
{
|
13112 |
+
"epoch": 0.47,
|
13113 |
+
"grad_norm": 27.25,
|
13114 |
+
"learning_rate": 7.667796610169493e-06,
|
13115 |
+
"loss": 1.2071,
|
13116 |
+
"step": 18690
|
13117 |
+
},
|
13118 |
+
{
|
13119 |
+
"epoch": 0.47,
|
13120 |
+
"grad_norm": 20.875,
|
13121 |
+
"learning_rate": 7.661016949152543e-06,
|
13122 |
+
"loss": 1.3875,
|
13123 |
+
"step": 18700
|
13124 |
+
},
|
13125 |
+
{
|
13126 |
+
"epoch": 0.47,
|
13127 |
+
"grad_norm": 70.0,
|
13128 |
+
"learning_rate": 7.654237288135594e-06,
|
13129 |
+
"loss": 1.2638,
|
13130 |
+
"step": 18710
|
13131 |
+
},
|
13132 |
+
{
|
13133 |
+
"epoch": 0.47,
|
13134 |
+
"grad_norm": 20.0,
|
13135 |
+
"learning_rate": 7.647457627118645e-06,
|
13136 |
+
"loss": 1.4459,
|
13137 |
+
"step": 18720
|
13138 |
+
},
|
13139 |
+
{
|
13140 |
+
"epoch": 0.47,
|
13141 |
+
"grad_norm": 17.125,
|
13142 |
+
"learning_rate": 7.640677966101695e-06,
|
13143 |
+
"loss": 1.4673,
|
13144 |
+
"step": 18730
|
13145 |
+
},
|
13146 |
+
{
|
13147 |
+
"epoch": 0.47,
|
13148 |
+
"grad_norm": 22.625,
|
13149 |
+
"learning_rate": 7.633898305084746e-06,
|
13150 |
+
"loss": 1.464,
|
13151 |
+
"step": 18740
|
13152 |
+
},
|
13153 |
+
{
|
13154 |
+
"epoch": 0.47,
|
13155 |
+
"grad_norm": 11.0,
|
13156 |
+
"learning_rate": 7.627118644067797e-06,
|
13157 |
+
"loss": 1.3678,
|
13158 |
+
"step": 18750
|
13159 |
+
},
|
13160 |
+
{
|
13161 |
+
"epoch": 0.47,
|
13162 |
+
"grad_norm": 39.5,
|
13163 |
+
"learning_rate": 7.6203389830508476e-06,
|
13164 |
+
"loss": 1.4107,
|
13165 |
+
"step": 18760
|
13166 |
+
},
|
13167 |
+
{
|
13168 |
+
"epoch": 0.47,
|
13169 |
+
"grad_norm": 18.25,
|
13170 |
+
"learning_rate": 7.613559322033899e-06,
|
13171 |
+
"loss": 1.2465,
|
13172 |
+
"step": 18770
|
13173 |
+
},
|
13174 |
+
{
|
13175 |
+
"epoch": 0.47,
|
13176 |
+
"grad_norm": 43.75,
|
13177 |
+
"learning_rate": 7.6067796610169495e-06,
|
13178 |
+
"loss": 1.3577,
|
13179 |
+
"step": 18780
|
13180 |
+
},
|
13181 |
+
{
|
13182 |
+
"epoch": 0.47,
|
13183 |
+
"grad_norm": 24.0,
|
13184 |
+
"learning_rate": 7.600000000000001e-06,
|
13185 |
+
"loss": 1.3036,
|
13186 |
+
"step": 18790
|
13187 |
+
},
|
13188 |
+
{
|
13189 |
+
"epoch": 0.47,
|
13190 |
+
"grad_norm": 16.625,
|
13191 |
+
"learning_rate": 7.5932203389830515e-06,
|
13192 |
+
"loss": 1.3213,
|
13193 |
+
"step": 18800
|
13194 |
+
},
|
13195 |
+
{
|
13196 |
+
"epoch": 0.47,
|
13197 |
+
"grad_norm": 44.0,
|
13198 |
+
"learning_rate": 7.586440677966103e-06,
|
13199 |
+
"loss": 1.378,
|
13200 |
+
"step": 18810
|
13201 |
+
},
|
13202 |
+
{
|
13203 |
+
"epoch": 0.47,
|
13204 |
+
"grad_norm": 11.1875,
|
13205 |
+
"learning_rate": 7.5796610169491534e-06,
|
13206 |
+
"loss": 1.2166,
|
13207 |
+
"step": 18820
|
13208 |
+
},
|
13209 |
+
{
|
13210 |
+
"epoch": 0.47,
|
13211 |
+
"grad_norm": 16.75,
|
13212 |
+
"learning_rate": 7.572881355932205e-06,
|
13213 |
+
"loss": 1.3052,
|
13214 |
+
"step": 18830
|
13215 |
+
},
|
13216 |
+
{
|
13217 |
+
"epoch": 0.47,
|
13218 |
+
"grad_norm": 47.75,
|
13219 |
+
"learning_rate": 7.5661016949152545e-06,
|
13220 |
+
"loss": 1.2388,
|
13221 |
+
"step": 18840
|
13222 |
+
},
|
13223 |
+
{
|
13224 |
+
"epoch": 0.47,
|
13225 |
+
"grad_norm": 17.25,
|
13226 |
+
"learning_rate": 7.559322033898305e-06,
|
13227 |
+
"loss": 1.5168,
|
13228 |
+
"step": 18850
|
13229 |
+
},
|
13230 |
+
{
|
13231 |
+
"epoch": 0.47,
|
13232 |
+
"grad_norm": 42.75,
|
13233 |
+
"learning_rate": 7.5525423728813565e-06,
|
13234 |
+
"loss": 1.062,
|
13235 |
+
"step": 18860
|
13236 |
+
},
|
13237 |
+
{
|
13238 |
+
"epoch": 0.47,
|
13239 |
+
"grad_norm": 33.5,
|
13240 |
+
"learning_rate": 7.545762711864407e-06,
|
13241 |
+
"loss": 1.4678,
|
13242 |
+
"step": 18870
|
13243 |
+
},
|
13244 |
+
{
|
13245 |
+
"epoch": 0.47,
|
13246 |
+
"grad_norm": 35.0,
|
13247 |
+
"learning_rate": 7.5389830508474584e-06,
|
13248 |
+
"loss": 1.3447,
|
13249 |
+
"step": 18880
|
13250 |
+
},
|
13251 |
+
{
|
13252 |
+
"epoch": 0.47,
|
13253 |
+
"grad_norm": 18.5,
|
13254 |
+
"learning_rate": 7.532203389830509e-06,
|
13255 |
+
"loss": 1.3211,
|
13256 |
+
"step": 18890
|
13257 |
+
},
|
13258 |
+
{
|
13259 |
+
"epoch": 0.47,
|
13260 |
+
"grad_norm": 48.5,
|
13261 |
+
"learning_rate": 7.52542372881356e-06,
|
13262 |
+
"loss": 1.4629,
|
13263 |
+
"step": 18900
|
13264 |
+
},
|
13265 |
+
{
|
13266 |
+
"epoch": 0.47,
|
13267 |
+
"grad_norm": 17.625,
|
13268 |
+
"learning_rate": 7.518644067796611e-06,
|
13269 |
+
"loss": 1.3007,
|
13270 |
+
"step": 18910
|
13271 |
+
},
|
13272 |
+
{
|
13273 |
+
"epoch": 0.47,
|
13274 |
+
"grad_norm": 30.125,
|
13275 |
+
"learning_rate": 7.511864406779662e-06,
|
13276 |
+
"loss": 1.3632,
|
13277 |
+
"step": 18920
|
13278 |
+
},
|
13279 |
+
{
|
13280 |
+
"epoch": 0.47,
|
13281 |
+
"grad_norm": 40.0,
|
13282 |
+
"learning_rate": 7.505084745762713e-06,
|
13283 |
+
"loss": 1.3538,
|
13284 |
+
"step": 18930
|
13285 |
+
},
|
13286 |
+
{
|
13287 |
+
"epoch": 0.47,
|
13288 |
+
"grad_norm": 48.25,
|
13289 |
+
"learning_rate": 7.498305084745763e-06,
|
13290 |
+
"loss": 1.2519,
|
13291 |
+
"step": 18940
|
13292 |
+
},
|
13293 |
+
{
|
13294 |
+
"epoch": 0.47,
|
13295 |
+
"grad_norm": 34.0,
|
13296 |
+
"learning_rate": 7.491525423728814e-06,
|
13297 |
+
"loss": 1.1823,
|
13298 |
+
"step": 18950
|
13299 |
+
},
|
13300 |
+
{
|
13301 |
+
"epoch": 0.47,
|
13302 |
+
"grad_norm": 43.5,
|
13303 |
+
"learning_rate": 7.4847457627118646e-06,
|
13304 |
+
"loss": 1.2593,
|
13305 |
+
"step": 18960
|
13306 |
+
},
|
13307 |
+
{
|
13308 |
+
"epoch": 0.47,
|
13309 |
+
"grad_norm": 19.25,
|
13310 |
+
"learning_rate": 7.477966101694916e-06,
|
13311 |
+
"loss": 1.3109,
|
13312 |
+
"step": 18970
|
13313 |
+
},
|
13314 |
+
{
|
13315 |
+
"epoch": 0.47,
|
13316 |
+
"grad_norm": 43.0,
|
13317 |
+
"learning_rate": 7.4711864406779665e-06,
|
13318 |
+
"loss": 1.3609,
|
13319 |
+
"step": 18980
|
13320 |
+
},
|
13321 |
+
{
|
13322 |
+
"epoch": 0.47,
|
13323 |
+
"grad_norm": 23.0,
|
13324 |
+
"learning_rate": 7.464406779661018e-06,
|
13325 |
+
"loss": 1.366,
|
13326 |
+
"step": 18990
|
13327 |
+
},
|
13328 |
+
{
|
13329 |
+
"epoch": 0.47,
|
13330 |
+
"grad_norm": 25.75,
|
13331 |
+
"learning_rate": 7.4576271186440685e-06,
|
13332 |
+
"loss": 1.3806,
|
13333 |
+
"step": 19000
|
13334 |
+
},
|
13335 |
+
{
|
13336 |
+
"epoch": 0.48,
|
13337 |
+
"grad_norm": 10.75,
|
13338 |
+
"learning_rate": 7.45084745762712e-06,
|
13339 |
+
"loss": 1.2656,
|
13340 |
+
"step": 19010
|
13341 |
+
},
|
13342 |
+
{
|
13343 |
+
"epoch": 0.48,
|
13344 |
+
"grad_norm": 28.625,
|
13345 |
+
"learning_rate": 7.4440677966101704e-06,
|
13346 |
+
"loss": 1.3377,
|
13347 |
+
"step": 19020
|
13348 |
+
},
|
13349 |
+
{
|
13350 |
+
"epoch": 0.48,
|
13351 |
+
"grad_norm": 17.875,
|
13352 |
+
"learning_rate": 7.437288135593221e-06,
|
13353 |
+
"loss": 1.3975,
|
13354 |
+
"step": 19030
|
13355 |
+
},
|
13356 |
+
{
|
13357 |
+
"epoch": 0.48,
|
13358 |
+
"grad_norm": 45.0,
|
13359 |
+
"learning_rate": 7.430508474576272e-06,
|
13360 |
+
"loss": 1.1749,
|
13361 |
+
"step": 19040
|
13362 |
+
},
|
13363 |
+
{
|
13364 |
+
"epoch": 0.48,
|
13365 |
+
"grad_norm": 13.375,
|
13366 |
+
"learning_rate": 7.423728813559322e-06,
|
13367 |
+
"loss": 1.4238,
|
13368 |
+
"step": 19050
|
13369 |
+
},
|
13370 |
+
{
|
13371 |
+
"epoch": 0.48,
|
13372 |
+
"grad_norm": 10.9375,
|
13373 |
+
"learning_rate": 7.4169491525423735e-06,
|
13374 |
+
"loss": 1.366,
|
13375 |
+
"step": 19060
|
13376 |
+
},
|
13377 |
+
{
|
13378 |
+
"epoch": 0.48,
|
13379 |
+
"grad_norm": 40.25,
|
13380 |
+
"learning_rate": 7.410169491525424e-06,
|
13381 |
+
"loss": 1.4467,
|
13382 |
+
"step": 19070
|
13383 |
+
},
|
13384 |
+
{
|
13385 |
+
"epoch": 0.48,
|
13386 |
+
"grad_norm": 15.4375,
|
13387 |
+
"learning_rate": 7.4033898305084754e-06,
|
13388 |
+
"loss": 1.4484,
|
13389 |
+
"step": 19080
|
13390 |
+
},
|
13391 |
+
{
|
13392 |
+
"epoch": 0.48,
|
13393 |
+
"grad_norm": 12.8125,
|
13394 |
+
"learning_rate": 7.396610169491526e-06,
|
13395 |
+
"loss": 1.4081,
|
13396 |
+
"step": 19090
|
13397 |
+
},
|
13398 |
+
{
|
13399 |
+
"epoch": 0.48,
|
13400 |
+
"grad_norm": 30.0,
|
13401 |
+
"learning_rate": 7.3898305084745766e-06,
|
13402 |
+
"loss": 1.3336,
|
13403 |
+
"step": 19100
|
13404 |
+
},
|
13405 |
+
{
|
13406 |
+
"epoch": 0.48,
|
13407 |
+
"grad_norm": 36.5,
|
13408 |
+
"learning_rate": 7.383050847457628e-06,
|
13409 |
+
"loss": 1.3015,
|
13410 |
+
"step": 19110
|
13411 |
+
},
|
13412 |
+
{
|
13413 |
+
"epoch": 0.48,
|
13414 |
+
"grad_norm": 13.875,
|
13415 |
+
"learning_rate": 7.3762711864406785e-06,
|
13416 |
+
"loss": 1.4283,
|
13417 |
+
"step": 19120
|
13418 |
+
},
|
13419 |
+
{
|
13420 |
+
"epoch": 0.48,
|
13421 |
+
"grad_norm": 41.0,
|
13422 |
+
"learning_rate": 7.36949152542373e-06,
|
13423 |
+
"loss": 1.2612,
|
13424 |
+
"step": 19130
|
13425 |
+
},
|
13426 |
+
{
|
13427 |
+
"epoch": 0.48,
|
13428 |
+
"grad_norm": 18.0,
|
13429 |
+
"learning_rate": 7.3627118644067805e-06,
|
13430 |
+
"loss": 1.1913,
|
13431 |
+
"step": 19140
|
13432 |
+
},
|
13433 |
+
{
|
13434 |
+
"epoch": 0.48,
|
13435 |
+
"grad_norm": 19.375,
|
13436 |
+
"learning_rate": 7.355932203389831e-06,
|
13437 |
+
"loss": 1.338,
|
13438 |
+
"step": 19150
|
13439 |
+
},
|
13440 |
+
{
|
13441 |
+
"epoch": 0.48,
|
13442 |
+
"grad_norm": 51.5,
|
13443 |
+
"learning_rate": 7.3491525423728816e-06,
|
13444 |
+
"loss": 1.3061,
|
13445 |
+
"step": 19160
|
13446 |
+
},
|
13447 |
+
{
|
13448 |
+
"epoch": 0.48,
|
13449 |
+
"grad_norm": 26.75,
|
13450 |
+
"learning_rate": 7.342372881355932e-06,
|
13451 |
+
"loss": 1.322,
|
13452 |
+
"step": 19170
|
13453 |
+
},
|
13454 |
+
{
|
13455 |
+
"epoch": 0.48,
|
13456 |
+
"grad_norm": 30.625,
|
13457 |
+
"learning_rate": 7.3355932203389835e-06,
|
13458 |
+
"loss": 1.4052,
|
13459 |
+
"step": 19180
|
13460 |
+
},
|
13461 |
+
{
|
13462 |
+
"epoch": 0.48,
|
13463 |
+
"grad_norm": 27.125,
|
13464 |
+
"learning_rate": 7.328813559322034e-06,
|
13465 |
+
"loss": 1.3729,
|
13466 |
+
"step": 19190
|
13467 |
+
},
|
13468 |
+
{
|
13469 |
+
"epoch": 0.48,
|
13470 |
+
"grad_norm": 36.0,
|
13471 |
+
"learning_rate": 7.3220338983050855e-06,
|
13472 |
+
"loss": 1.225,
|
13473 |
+
"step": 19200
|
13474 |
+
},
|
13475 |
+
{
|
13476 |
+
"epoch": 0.48,
|
13477 |
+
"grad_norm": 11.375,
|
13478 |
+
"learning_rate": 7.315254237288136e-06,
|
13479 |
+
"loss": 1.3407,
|
13480 |
+
"step": 19210
|
13481 |
+
},
|
13482 |
+
{
|
13483 |
+
"epoch": 0.48,
|
13484 |
+
"grad_norm": 40.0,
|
13485 |
+
"learning_rate": 7.3084745762711874e-06,
|
13486 |
+
"loss": 1.3733,
|
13487 |
+
"step": 19220
|
13488 |
+
},
|
13489 |
+
{
|
13490 |
+
"epoch": 0.48,
|
13491 |
+
"grad_norm": 42.5,
|
13492 |
+
"learning_rate": 7.301694915254238e-06,
|
13493 |
+
"loss": 1.2025,
|
13494 |
+
"step": 19230
|
13495 |
+
},
|
13496 |
+
{
|
13497 |
+
"epoch": 0.48,
|
13498 |
+
"grad_norm": 54.25,
|
13499 |
+
"learning_rate": 7.294915254237289e-06,
|
13500 |
+
"loss": 1.337,
|
13501 |
+
"step": 19240
|
13502 |
+
},
|
13503 |
+
{
|
13504 |
+
"epoch": 0.48,
|
13505 |
+
"grad_norm": 22.75,
|
13506 |
+
"learning_rate": 7.288135593220339e-06,
|
13507 |
+
"loss": 1.4894,
|
13508 |
+
"step": 19250
|
13509 |
+
},
|
13510 |
+
{
|
13511 |
+
"epoch": 0.48,
|
13512 |
+
"grad_norm": 14.375,
|
13513 |
+
"learning_rate": 7.28135593220339e-06,
|
13514 |
+
"loss": 1.4047,
|
13515 |
+
"step": 19260
|
13516 |
+
},
|
13517 |
+
{
|
13518 |
+
"epoch": 0.48,
|
13519 |
+
"grad_norm": 44.5,
|
13520 |
+
"learning_rate": 7.274576271186441e-06,
|
13521 |
+
"loss": 1.3092,
|
13522 |
+
"step": 19270
|
13523 |
+
},
|
13524 |
+
{
|
13525 |
+
"epoch": 0.48,
|
13526 |
+
"grad_norm": 29.75,
|
13527 |
+
"learning_rate": 7.267796610169492e-06,
|
13528 |
+
"loss": 1.326,
|
13529 |
+
"step": 19280
|
13530 |
+
},
|
13531 |
+
{
|
13532 |
+
"epoch": 0.48,
|
13533 |
+
"grad_norm": 35.0,
|
13534 |
+
"learning_rate": 7.261016949152543e-06,
|
13535 |
+
"loss": 1.4019,
|
13536 |
+
"step": 19290
|
13537 |
+
},
|
13538 |
+
{
|
13539 |
+
"epoch": 0.48,
|
13540 |
+
"grad_norm": 22.5,
|
13541 |
+
"learning_rate": 7.2542372881355936e-06,
|
13542 |
+
"loss": 1.5459,
|
13543 |
+
"step": 19300
|
13544 |
+
},
|
13545 |
+
{
|
13546 |
+
"epoch": 0.48,
|
13547 |
+
"grad_norm": 16.25,
|
13548 |
+
"learning_rate": 7.247457627118645e-06,
|
13549 |
+
"loss": 1.119,
|
13550 |
+
"step": 19310
|
13551 |
+
},
|
13552 |
+
{
|
13553 |
+
"epoch": 0.48,
|
13554 |
+
"grad_norm": 22.75,
|
13555 |
+
"learning_rate": 7.2406779661016955e-06,
|
13556 |
+
"loss": 1.2458,
|
13557 |
+
"step": 19320
|
13558 |
+
},
|
13559 |
+
{
|
13560 |
+
"epoch": 0.48,
|
13561 |
+
"grad_norm": 15.0,
|
13562 |
+
"learning_rate": 7.233898305084747e-06,
|
13563 |
+
"loss": 1.2145,
|
13564 |
+
"step": 19330
|
13565 |
+
},
|
13566 |
+
{
|
13567 |
+
"epoch": 0.48,
|
13568 |
+
"grad_norm": 13.9375,
|
13569 |
+
"learning_rate": 7.2271186440677975e-06,
|
13570 |
+
"loss": 1.2367,
|
13571 |
+
"step": 19340
|
13572 |
+
},
|
13573 |
+
{
|
13574 |
+
"epoch": 0.48,
|
13575 |
+
"grad_norm": 22.875,
|
13576 |
+
"learning_rate": 7.220338983050849e-06,
|
13577 |
+
"loss": 1.409,
|
13578 |
+
"step": 19350
|
13579 |
+
},
|
13580 |
+
{
|
13581 |
+
"epoch": 0.48,
|
13582 |
+
"grad_norm": 25.75,
|
13583 |
+
"learning_rate": 7.2135593220338986e-06,
|
13584 |
+
"loss": 1.4789,
|
13585 |
+
"step": 19360
|
13586 |
+
},
|
13587 |
+
{
|
13588 |
+
"epoch": 0.48,
|
13589 |
+
"grad_norm": 8.5,
|
13590 |
+
"learning_rate": 7.206779661016949e-06,
|
13591 |
+
"loss": 1.1454,
|
13592 |
+
"step": 19370
|
13593 |
+
},
|
13594 |
+
{
|
13595 |
+
"epoch": 0.48,
|
13596 |
+
"grad_norm": 40.0,
|
13597 |
+
"learning_rate": 7.2000000000000005e-06,
|
13598 |
+
"loss": 1.4167,
|
13599 |
+
"step": 19380
|
13600 |
+
},
|
13601 |
+
{
|
13602 |
+
"epoch": 0.48,
|
13603 |
+
"grad_norm": 17.875,
|
13604 |
+
"learning_rate": 7.193220338983051e-06,
|
13605 |
+
"loss": 1.4434,
|
13606 |
+
"step": 19390
|
13607 |
+
},
|
13608 |
+
{
|
13609 |
+
"epoch": 0.48,
|
13610 |
+
"grad_norm": 13.4375,
|
13611 |
+
"learning_rate": 7.1864406779661025e-06,
|
13612 |
+
"loss": 1.3475,
|
13613 |
+
"step": 19400
|
13614 |
+
},
|
13615 |
+
{
|
13616 |
+
"epoch": 0.49,
|
13617 |
+
"grad_norm": 19.75,
|
13618 |
+
"learning_rate": 7.179661016949153e-06,
|
13619 |
+
"loss": 1.4455,
|
13620 |
+
"step": 19410
|
13621 |
+
},
|
13622 |
+
{
|
13623 |
+
"epoch": 0.49,
|
13624 |
+
"grad_norm": 18.625,
|
13625 |
+
"learning_rate": 7.1728813559322044e-06,
|
13626 |
+
"loss": 1.2853,
|
13627 |
+
"step": 19420
|
13628 |
+
},
|
13629 |
+
{
|
13630 |
+
"epoch": 0.49,
|
13631 |
+
"grad_norm": 39.5,
|
13632 |
+
"learning_rate": 7.166101694915255e-06,
|
13633 |
+
"loss": 1.5275,
|
13634 |
+
"step": 19430
|
13635 |
+
},
|
13636 |
+
{
|
13637 |
+
"epoch": 0.49,
|
13638 |
+
"grad_norm": 26.375,
|
13639 |
+
"learning_rate": 7.159322033898306e-06,
|
13640 |
+
"loss": 1.4343,
|
13641 |
+
"step": 19440
|
13642 |
+
},
|
13643 |
+
{
|
13644 |
+
"epoch": 0.49,
|
13645 |
+
"grad_norm": 46.5,
|
13646 |
+
"learning_rate": 7.152542372881357e-06,
|
13647 |
+
"loss": 1.2769,
|
13648 |
+
"step": 19450
|
13649 |
+
},
|
13650 |
+
{
|
13651 |
+
"epoch": 0.49,
|
13652 |
+
"grad_norm": 17.625,
|
13653 |
+
"learning_rate": 7.145762711864407e-06,
|
13654 |
+
"loss": 1.4347,
|
13655 |
+
"step": 19460
|
13656 |
+
},
|
13657 |
+
{
|
13658 |
+
"epoch": 0.49,
|
13659 |
+
"grad_norm": 22.625,
|
13660 |
+
"learning_rate": 7.138983050847458e-06,
|
13661 |
+
"loss": 1.3595,
|
13662 |
+
"step": 19470
|
13663 |
+
},
|
13664 |
+
{
|
13665 |
+
"epoch": 0.49,
|
13666 |
+
"grad_norm": 19.125,
|
13667 |
+
"learning_rate": 7.132203389830509e-06,
|
13668 |
+
"loss": 1.2369,
|
13669 |
+
"step": 19480
|
13670 |
+
},
|
13671 |
+
{
|
13672 |
+
"epoch": 0.49,
|
13673 |
+
"grad_norm": 14.1875,
|
13674 |
+
"learning_rate": 7.12542372881356e-06,
|
13675 |
+
"loss": 1.2498,
|
13676 |
+
"step": 19490
|
13677 |
+
},
|
13678 |
+
{
|
13679 |
+
"epoch": 0.49,
|
13680 |
+
"grad_norm": 11.5625,
|
13681 |
+
"learning_rate": 7.1186440677966106e-06,
|
13682 |
+
"loss": 1.2684,
|
13683 |
+
"step": 19500
|
13684 |
+
},
|
13685 |
+
{
|
13686 |
+
"epoch": 0.49,
|
13687 |
+
"grad_norm": 14.625,
|
13688 |
+
"learning_rate": 7.111864406779662e-06,
|
13689 |
+
"loss": 1.4037,
|
13690 |
+
"step": 19510
|
13691 |
+
},
|
13692 |
+
{
|
13693 |
+
"epoch": 0.49,
|
13694 |
+
"grad_norm": 26.375,
|
13695 |
+
"learning_rate": 7.1050847457627125e-06,
|
13696 |
+
"loss": 1.3848,
|
13697 |
+
"step": 19520
|
13698 |
+
},
|
13699 |
+
{
|
13700 |
+
"epoch": 0.49,
|
13701 |
+
"grad_norm": 23.875,
|
13702 |
+
"learning_rate": 7.098305084745764e-06,
|
13703 |
+
"loss": 1.2486,
|
13704 |
+
"step": 19530
|
13705 |
+
},
|
13706 |
+
{
|
13707 |
+
"epoch": 0.49,
|
13708 |
+
"grad_norm": 19.125,
|
13709 |
+
"learning_rate": 7.0915254237288145e-06,
|
13710 |
+
"loss": 1.3605,
|
13711 |
+
"step": 19540
|
13712 |
+
},
|
13713 |
+
{
|
13714 |
+
"epoch": 0.49,
|
13715 |
+
"grad_norm": 16.0,
|
13716 |
+
"learning_rate": 7.084745762711865e-06,
|
13717 |
+
"loss": 1.3836,
|
13718 |
+
"step": 19550
|
13719 |
+
},
|
13720 |
+
{
|
13721 |
+
"epoch": 0.49,
|
13722 |
+
"grad_norm": 15.25,
|
13723 |
+
"learning_rate": 7.077966101694916e-06,
|
13724 |
+
"loss": 1.2622,
|
13725 |
+
"step": 19560
|
13726 |
+
},
|
13727 |
+
{
|
13728 |
+
"epoch": 0.49,
|
13729 |
+
"grad_norm": 10.375,
|
13730 |
+
"learning_rate": 7.071186440677966e-06,
|
13731 |
+
"loss": 1.2034,
|
13732 |
+
"step": 19570
|
13733 |
+
},
|
13734 |
+
{
|
13735 |
+
"epoch": 0.49,
|
13736 |
+
"grad_norm": 26.625,
|
13737 |
+
"learning_rate": 7.0644067796610175e-06,
|
13738 |
+
"loss": 1.2822,
|
13739 |
+
"step": 19580
|
13740 |
+
},
|
13741 |
+
{
|
13742 |
+
"epoch": 0.49,
|
13743 |
+
"grad_norm": 20.0,
|
13744 |
+
"learning_rate": 7.057627118644068e-06,
|
13745 |
+
"loss": 1.2962,
|
13746 |
+
"step": 19590
|
13747 |
+
},
|
13748 |
+
{
|
13749 |
+
"epoch": 0.49,
|
13750 |
+
"grad_norm": 54.25,
|
13751 |
+
"learning_rate": 7.0508474576271195e-06,
|
13752 |
+
"loss": 1.305,
|
13753 |
+
"step": 19600
|
13754 |
+
},
|
13755 |
+
{
|
13756 |
+
"epoch": 0.49,
|
13757 |
+
"grad_norm": 13.375,
|
13758 |
+
"learning_rate": 7.04406779661017e-06,
|
13759 |
+
"loss": 1.2614,
|
13760 |
+
"step": 19610
|
13761 |
+
},
|
13762 |
+
{
|
13763 |
+
"epoch": 0.49,
|
13764 |
+
"grad_norm": 16.75,
|
13765 |
+
"learning_rate": 7.037288135593221e-06,
|
13766 |
+
"loss": 1.375,
|
13767 |
+
"step": 19620
|
13768 |
+
},
|
13769 |
+
{
|
13770 |
+
"epoch": 0.49,
|
13771 |
+
"grad_norm": 33.5,
|
13772 |
+
"learning_rate": 7.030508474576272e-06,
|
13773 |
+
"loss": 1.2791,
|
13774 |
+
"step": 19630
|
13775 |
+
},
|
13776 |
+
{
|
13777 |
+
"epoch": 0.49,
|
13778 |
+
"grad_norm": 35.5,
|
13779 |
+
"learning_rate": 7.0237288135593225e-06,
|
13780 |
+
"loss": 1.2757,
|
13781 |
+
"step": 19640
|
13782 |
+
},
|
13783 |
+
{
|
13784 |
+
"epoch": 0.49,
|
13785 |
+
"grad_norm": 14.3125,
|
13786 |
+
"learning_rate": 7.016949152542374e-06,
|
13787 |
+
"loss": 1.1156,
|
13788 |
+
"step": 19650
|
13789 |
+
},
|
13790 |
+
{
|
13791 |
+
"epoch": 0.49,
|
13792 |
+
"grad_norm": 13.4375,
|
13793 |
+
"learning_rate": 7.0101694915254245e-06,
|
13794 |
+
"loss": 1.3131,
|
13795 |
+
"step": 19660
|
13796 |
+
},
|
13797 |
+
{
|
13798 |
+
"epoch": 0.49,
|
13799 |
+
"grad_norm": 32.25,
|
13800 |
+
"learning_rate": 7.003389830508475e-06,
|
13801 |
+
"loss": 1.2609,
|
13802 |
+
"step": 19670
|
13803 |
+
},
|
13804 |
+
{
|
13805 |
+
"epoch": 0.49,
|
13806 |
+
"grad_norm": 18.875,
|
13807 |
+
"learning_rate": 6.996610169491526e-06,
|
13808 |
+
"loss": 1.2951,
|
13809 |
+
"step": 19680
|
13810 |
+
},
|
13811 |
+
{
|
13812 |
+
"epoch": 0.49,
|
13813 |
+
"grad_norm": 7.625,
|
13814 |
+
"learning_rate": 6.989830508474576e-06,
|
13815 |
+
"loss": 1.2051,
|
13816 |
+
"step": 19690
|
13817 |
+
},
|
13818 |
+
{
|
13819 |
+
"epoch": 0.49,
|
13820 |
+
"grad_norm": 28.375,
|
13821 |
+
"learning_rate": 6.9830508474576275e-06,
|
13822 |
+
"loss": 1.4015,
|
13823 |
+
"step": 19700
|
13824 |
+
},
|
13825 |
+
{
|
13826 |
+
"epoch": 0.49,
|
13827 |
+
"grad_norm": 26.75,
|
13828 |
+
"learning_rate": 6.976271186440678e-06,
|
13829 |
+
"loss": 1.5129,
|
13830 |
+
"step": 19710
|
13831 |
+
},
|
13832 |
+
{
|
13833 |
+
"epoch": 0.49,
|
13834 |
+
"grad_norm": 15.0,
|
13835 |
+
"learning_rate": 6.9694915254237295e-06,
|
13836 |
+
"loss": 1.3309,
|
13837 |
+
"step": 19720
|
13838 |
+
},
|
13839 |
+
{
|
13840 |
+
"epoch": 0.49,
|
13841 |
+
"grad_norm": 57.5,
|
13842 |
+
"learning_rate": 6.96271186440678e-06,
|
13843 |
+
"loss": 1.3555,
|
13844 |
+
"step": 19730
|
13845 |
+
},
|
13846 |
+
{
|
13847 |
+
"epoch": 0.49,
|
13848 |
+
"grad_norm": 63.75,
|
13849 |
+
"learning_rate": 6.9559322033898315e-06,
|
13850 |
+
"loss": 1.3705,
|
13851 |
+
"step": 19740
|
13852 |
+
},
|
13853 |
+
{
|
13854 |
+
"epoch": 0.49,
|
13855 |
+
"grad_norm": 41.5,
|
13856 |
+
"learning_rate": 6.949152542372882e-06,
|
13857 |
+
"loss": 1.4068,
|
13858 |
+
"step": 19750
|
13859 |
+
},
|
13860 |
+
{
|
13861 |
+
"epoch": 0.49,
|
13862 |
+
"grad_norm": 27.75,
|
13863 |
+
"learning_rate": 6.942372881355933e-06,
|
13864 |
+
"loss": 1.3491,
|
13865 |
+
"step": 19760
|
13866 |
+
},
|
13867 |
+
{
|
13868 |
+
"epoch": 0.49,
|
13869 |
+
"grad_norm": 17.375,
|
13870 |
+
"learning_rate": 6.935593220338983e-06,
|
13871 |
+
"loss": 1.5199,
|
13872 |
+
"step": 19770
|
13873 |
+
},
|
13874 |
+
{
|
13875 |
+
"epoch": 0.49,
|
13876 |
+
"grad_norm": 24.75,
|
13877 |
+
"learning_rate": 6.928813559322034e-06,
|
13878 |
+
"loss": 1.3964,
|
13879 |
+
"step": 19780
|
13880 |
+
},
|
13881 |
+
{
|
13882 |
+
"epoch": 0.49,
|
13883 |
+
"grad_norm": 49.75,
|
13884 |
+
"learning_rate": 6.922033898305085e-06,
|
13885 |
+
"loss": 1.0419,
|
13886 |
+
"step": 19790
|
13887 |
+
},
|
13888 |
+
{
|
13889 |
+
"epoch": 0.49,
|
13890 |
+
"grad_norm": 28.25,
|
13891 |
+
"learning_rate": 6.915254237288136e-06,
|
13892 |
+
"loss": 1.4914,
|
13893 |
+
"step": 19800
|
13894 |
+
},
|
13895 |
+
{
|
13896 |
+
"epoch": 0.5,
|
13897 |
+
"grad_norm": 36.75,
|
13898 |
+
"learning_rate": 6.908474576271187e-06,
|
13899 |
+
"loss": 1.37,
|
13900 |
+
"step": 19810
|
13901 |
+
},
|
13902 |
+
{
|
13903 |
+
"epoch": 0.5,
|
13904 |
+
"grad_norm": 9.9375,
|
13905 |
+
"learning_rate": 6.901694915254238e-06,
|
13906 |
+
"loss": 1.2354,
|
13907 |
+
"step": 19820
|
13908 |
+
},
|
13909 |
+
{
|
13910 |
+
"epoch": 0.5,
|
13911 |
+
"grad_norm": 36.75,
|
13912 |
+
"learning_rate": 6.894915254237289e-06,
|
13913 |
+
"loss": 1.1537,
|
13914 |
+
"step": 19830
|
13915 |
+
},
|
13916 |
+
{
|
13917 |
+
"epoch": 0.5,
|
13918 |
+
"grad_norm": 24.125,
|
13919 |
+
"learning_rate": 6.8881355932203395e-06,
|
13920 |
+
"loss": 1.1772,
|
13921 |
+
"step": 19840
|
13922 |
+
},
|
13923 |
+
{
|
13924 |
+
"epoch": 0.5,
|
13925 |
+
"grad_norm": 13.4375,
|
13926 |
+
"learning_rate": 6.881355932203391e-06,
|
13927 |
+
"loss": 1.3237,
|
13928 |
+
"step": 19850
|
13929 |
+
},
|
13930 |
+
{
|
13931 |
+
"epoch": 0.5,
|
13932 |
+
"grad_norm": 17.875,
|
13933 |
+
"learning_rate": 6.8745762711864415e-06,
|
13934 |
+
"loss": 1.1969,
|
13935 |
+
"step": 19860
|
13936 |
+
},
|
13937 |
+
{
|
13938 |
+
"epoch": 0.5,
|
13939 |
+
"grad_norm": 20.125,
|
13940 |
+
"learning_rate": 6.867796610169493e-06,
|
13941 |
+
"loss": 1.4368,
|
13942 |
+
"step": 19870
|
13943 |
+
},
|
13944 |
+
{
|
13945 |
+
"epoch": 0.5,
|
13946 |
+
"grad_norm": 35.5,
|
13947 |
+
"learning_rate": 6.861016949152543e-06,
|
13948 |
+
"loss": 1.281,
|
13949 |
+
"step": 19880
|
13950 |
+
},
|
13951 |
+
{
|
13952 |
+
"epoch": 0.5,
|
13953 |
+
"grad_norm": 10.6875,
|
13954 |
+
"learning_rate": 6.854237288135593e-06,
|
13955 |
+
"loss": 1.3889,
|
13956 |
+
"step": 19890
|
13957 |
+
},
|
13958 |
+
{
|
13959 |
+
"epoch": 0.5,
|
13960 |
+
"grad_norm": 25.5,
|
13961 |
+
"learning_rate": 6.8474576271186445e-06,
|
13962 |
+
"loss": 1.3965,
|
13963 |
+
"step": 19900
|
13964 |
+
},
|
13965 |
+
{
|
13966 |
+
"epoch": 0.5,
|
13967 |
+
"grad_norm": 41.75,
|
13968 |
+
"learning_rate": 6.840677966101695e-06,
|
13969 |
+
"loss": 1.2596,
|
13970 |
+
"step": 19910
|
13971 |
+
},
|
13972 |
+
{
|
13973 |
+
"epoch": 0.5,
|
13974 |
+
"grad_norm": 27.25,
|
13975 |
+
"learning_rate": 6.8338983050847465e-06,
|
13976 |
+
"loss": 1.3484,
|
13977 |
+
"step": 19920
|
13978 |
+
},
|
13979 |
+
{
|
13980 |
+
"epoch": 0.5,
|
13981 |
+
"grad_norm": 22.5,
|
13982 |
+
"learning_rate": 6.827118644067797e-06,
|
13983 |
+
"loss": 1.4299,
|
13984 |
+
"step": 19930
|
13985 |
+
},
|
13986 |
+
{
|
13987 |
+
"epoch": 0.5,
|
13988 |
+
"grad_norm": 16.375,
|
13989 |
+
"learning_rate": 6.8203389830508485e-06,
|
13990 |
+
"loss": 1.2868,
|
13991 |
+
"step": 19940
|
13992 |
+
},
|
13993 |
+
{
|
13994 |
+
"epoch": 0.5,
|
13995 |
+
"grad_norm": 18.25,
|
13996 |
+
"learning_rate": 6.813559322033899e-06,
|
13997 |
+
"loss": 1.2739,
|
13998 |
+
"step": 19950
|
13999 |
+
},
|
14000 |
+
{
|
14001 |
+
"epoch": 0.5,
|
14002 |
+
"grad_norm": 50.0,
|
14003 |
+
"learning_rate": 6.80677966101695e-06,
|
14004 |
+
"loss": 1.4644,
|
14005 |
+
"step": 19960
|
14006 |
+
},
|
14007 |
+
{
|
14008 |
+
"epoch": 0.5,
|
14009 |
+
"grad_norm": 38.5,
|
14010 |
+
"learning_rate": 6.800000000000001e-06,
|
14011 |
+
"loss": 1.4472,
|
14012 |
+
"step": 19970
|
14013 |
+
},
|
14014 |
+
{
|
14015 |
+
"epoch": 0.5,
|
14016 |
+
"grad_norm": 40.25,
|
14017 |
+
"learning_rate": 6.793220338983051e-06,
|
14018 |
+
"loss": 1.4583,
|
14019 |
+
"step": 19980
|
14020 |
+
},
|
14021 |
+
{
|
14022 |
+
"epoch": 0.5,
|
14023 |
+
"grad_norm": 52.0,
|
14024 |
+
"learning_rate": 6.786440677966102e-06,
|
14025 |
+
"loss": 1.4355,
|
14026 |
+
"step": 19990
|
14027 |
+
},
|
14028 |
+
{
|
14029 |
+
"epoch": 0.5,
|
14030 |
+
"grad_norm": 41.25,
|
14031 |
+
"learning_rate": 6.779661016949153e-06,
|
14032 |
+
"loss": 1.451,
|
14033 |
+
"step": 20000
|
14034 |
+
},
|
14035 |
+
{
|
14036 |
+
"epoch": 0.5,
|
14037 |
+
"eval_loss": 1.349025845527649,
|
14038 |
+
"eval_runtime": 59.2373,
|
14039 |
+
"eval_samples_per_second": 16.881,
|
14040 |
+
"eval_steps_per_second": 16.881,
|
14041 |
+
"step": 20000
|
14042 |
}
|
14043 |
],
|
14044 |
"logging_steps": 10,
|
14046 |
"num_input_tokens_seen": 0,
|
14047 |
"num_train_epochs": 1,
|
14048 |
"save_steps": 5000,
|
14049 |
+
"total_flos": 3.1467396661248e+17,
|
14050 |
"train_batch_size": 1,
|
14051 |
"trial_name": null,
|
14052 |
"trial_params": null
|
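
A minimal sketch for pulling the loss curve logged above out of trainer_state.json. It assumes the standard Hugging Face TrainerState layout (a "log_history" list whose training entries carry "step" and "loss" and whose evaluation entries carry "eval_loss", as in this diff); the file path is illustrative, not part of this commit.

import json

# Illustrative path; point it at the checkpoint directory from this upload.
with open("checkpoint-20000/trainer_state.json") as f:
    state = json.load(f)

# Training entries log "loss"; evaluation entries log "eval_loss" instead.
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print("last train loss:", train[-1])  # per this diff: (20000, 1.451)
print("eval losses:", evals)          # per this diff, ends with (20000, 1.3490...)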