Uploaded checkpoint-20000
Browse files- adapter_model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +1795 -5
adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 119975656
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:140c22cb100bb7fb3b9f92ae13ec5bb2bfcde7ed82d7e4434fc5a235f98cb24e
|
3 |
size 119975656
|
optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 240145026
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f49faa425cc20765775b424c81e8f5599e3725a2dc79226d42d68c4573812cfe
|
3 |
size 240145026
|
rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 14244
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d70b07077c15c8bd127eaf0a24ba45e81ca7ce6ae410b7a625f50c345ec6eb1f
|
3 |
size 14244
|
scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e7dc694a733ff91b79c5eaf7bcfe8aa41771c4ef8a47d325d2a9e9f6bc78f946
|
3 |
size 1064
|
trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
{
|
2 |
-
"best_metric": 1.
|
3 |
-
"best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-
|
4 |
-
"epoch": 0.
|
5 |
"eval_steps": 500,
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
@@ -12537,6 +12537,1796 @@
|
|
12537 |
"eval_samples_per_second": 15.113,
|
12538 |
"eval_steps_per_second": 15.113,
|
12539 |
"step": 17500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12540 |
}
|
12541 |
],
|
12542 |
"logging_steps": 10,
|
@@ -12544,7 +14334,7 @@
|
|
12544 |
"num_input_tokens_seen": 0,
|
12545 |
"num_train_epochs": 1,
|
12546 |
"save_steps": 2500,
|
12547 |
-
"total_flos":
|
12548 |
"train_batch_size": 1,
|
12549 |
"trial_name": null,
|
12550 |
"trial_params": null
|
|
|
1 |
{
|
2 |
+
"best_metric": 1.348677158355713,
|
3 |
+
"best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-20000",
|
4 |
+
"epoch": 0.5,
|
5 |
"eval_steps": 500,
|
6 |
+
"global_step": 20000,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
|
|
12537 |
"eval_samples_per_second": 15.113,
|
12538 |
"eval_steps_per_second": 15.113,
|
12539 |
"step": 17500
|
12540 |
+
},
|
12541 |
+
{
|
12542 |
+
"epoch": 0.44,
|
12543 |
+
"grad_norm": 5.189449310302734,
|
12544 |
+
"learning_rate": 8.467796610169492e-06,
|
12545 |
+
"loss": 1.352,
|
12546 |
+
"step": 17510
|
12547 |
+
},
|
12548 |
+
{
|
12549 |
+
"epoch": 0.44,
|
12550 |
+
"grad_norm": 2.765326499938965,
|
12551 |
+
"learning_rate": 8.461016949152543e-06,
|
12552 |
+
"loss": 1.5275,
|
12553 |
+
"step": 17520
|
12554 |
+
},
|
12555 |
+
{
|
12556 |
+
"epoch": 0.44,
|
12557 |
+
"grad_norm": 3.60880446434021,
|
12558 |
+
"learning_rate": 8.454237288135593e-06,
|
12559 |
+
"loss": 1.2793,
|
12560 |
+
"step": 17530
|
12561 |
+
},
|
12562 |
+
{
|
12563 |
+
"epoch": 0.44,
|
12564 |
+
"grad_norm": 3.072174310684204,
|
12565 |
+
"learning_rate": 8.447457627118644e-06,
|
12566 |
+
"loss": 1.2746,
|
12567 |
+
"step": 17540
|
12568 |
+
},
|
12569 |
+
{
|
12570 |
+
"epoch": 0.44,
|
12571 |
+
"grad_norm": 3.3222594261169434,
|
12572 |
+
"learning_rate": 8.440677966101696e-06,
|
12573 |
+
"loss": 1.3359,
|
12574 |
+
"step": 17550
|
12575 |
+
},
|
12576 |
+
{
|
12577 |
+
"epoch": 0.44,
|
12578 |
+
"grad_norm": 1.3298100233078003,
|
12579 |
+
"learning_rate": 8.433898305084747e-06,
|
12580 |
+
"loss": 1.3826,
|
12581 |
+
"step": 17560
|
12582 |
+
},
|
12583 |
+
{
|
12584 |
+
"epoch": 0.44,
|
12585 |
+
"grad_norm": 4.06503438949585,
|
12586 |
+
"learning_rate": 8.427118644067797e-06,
|
12587 |
+
"loss": 1.3339,
|
12588 |
+
"step": 17570
|
12589 |
+
},
|
12590 |
+
{
|
12591 |
+
"epoch": 0.44,
|
12592 |
+
"grad_norm": 6.150786399841309,
|
12593 |
+
"learning_rate": 8.420338983050848e-06,
|
12594 |
+
"loss": 1.3414,
|
12595 |
+
"step": 17580
|
12596 |
+
},
|
12597 |
+
{
|
12598 |
+
"epoch": 0.44,
|
12599 |
+
"grad_norm": 9.045748710632324,
|
12600 |
+
"learning_rate": 8.413559322033898e-06,
|
12601 |
+
"loss": 1.3183,
|
12602 |
+
"step": 17590
|
12603 |
+
},
|
12604 |
+
{
|
12605 |
+
"epoch": 0.44,
|
12606 |
+
"grad_norm": 4.415981769561768,
|
12607 |
+
"learning_rate": 8.40677966101695e-06,
|
12608 |
+
"loss": 1.3323,
|
12609 |
+
"step": 17600
|
12610 |
+
},
|
12611 |
+
{
|
12612 |
+
"epoch": 0.44,
|
12613 |
+
"grad_norm": 8.40713882446289,
|
12614 |
+
"learning_rate": 8.400000000000001e-06,
|
12615 |
+
"loss": 1.2519,
|
12616 |
+
"step": 17610
|
12617 |
+
},
|
12618 |
+
{
|
12619 |
+
"epoch": 0.44,
|
12620 |
+
"grad_norm": 6.841925144195557,
|
12621 |
+
"learning_rate": 8.39322033898305e-06,
|
12622 |
+
"loss": 1.4799,
|
12623 |
+
"step": 17620
|
12624 |
+
},
|
12625 |
+
{
|
12626 |
+
"epoch": 0.44,
|
12627 |
+
"grad_norm": 10.993582725524902,
|
12628 |
+
"learning_rate": 8.386440677966102e-06,
|
12629 |
+
"loss": 1.3495,
|
12630 |
+
"step": 17630
|
12631 |
+
},
|
12632 |
+
{
|
12633 |
+
"epoch": 0.44,
|
12634 |
+
"grad_norm": 2.115006685256958,
|
12635 |
+
"learning_rate": 8.379661016949153e-06,
|
12636 |
+
"loss": 1.3902,
|
12637 |
+
"step": 17640
|
12638 |
+
},
|
12639 |
+
{
|
12640 |
+
"epoch": 0.44,
|
12641 |
+
"grad_norm": 3.485889434814453,
|
12642 |
+
"learning_rate": 8.372881355932205e-06,
|
12643 |
+
"loss": 1.4462,
|
12644 |
+
"step": 17650
|
12645 |
+
},
|
12646 |
+
{
|
12647 |
+
"epoch": 0.44,
|
12648 |
+
"grad_norm": 5.725930690765381,
|
12649 |
+
"learning_rate": 8.366101694915255e-06,
|
12650 |
+
"loss": 1.4603,
|
12651 |
+
"step": 17660
|
12652 |
+
},
|
12653 |
+
{
|
12654 |
+
"epoch": 0.44,
|
12655 |
+
"grad_norm": 6.5530548095703125,
|
12656 |
+
"learning_rate": 8.359322033898306e-06,
|
12657 |
+
"loss": 1.4954,
|
12658 |
+
"step": 17670
|
12659 |
+
},
|
12660 |
+
{
|
12661 |
+
"epoch": 0.44,
|
12662 |
+
"grad_norm": 6.006740570068359,
|
12663 |
+
"learning_rate": 8.352542372881357e-06,
|
12664 |
+
"loss": 1.3556,
|
12665 |
+
"step": 17680
|
12666 |
+
},
|
12667 |
+
{
|
12668 |
+
"epoch": 0.44,
|
12669 |
+
"grad_norm": 5.5967936515808105,
|
12670 |
+
"learning_rate": 8.345762711864409e-06,
|
12671 |
+
"loss": 1.0719,
|
12672 |
+
"step": 17690
|
12673 |
+
},
|
12674 |
+
{
|
12675 |
+
"epoch": 0.44,
|
12676 |
+
"grad_norm": 5.70643949508667,
|
12677 |
+
"learning_rate": 8.338983050847458e-06,
|
12678 |
+
"loss": 1.2968,
|
12679 |
+
"step": 17700
|
12680 |
+
},
|
12681 |
+
{
|
12682 |
+
"epoch": 0.44,
|
12683 |
+
"grad_norm": 3.280465841293335,
|
12684 |
+
"learning_rate": 8.332203389830508e-06,
|
12685 |
+
"loss": 1.2669,
|
12686 |
+
"step": 17710
|
12687 |
+
},
|
12688 |
+
{
|
12689 |
+
"epoch": 0.44,
|
12690 |
+
"grad_norm": 3.098782777786255,
|
12691 |
+
"learning_rate": 8.32542372881356e-06,
|
12692 |
+
"loss": 1.4704,
|
12693 |
+
"step": 17720
|
12694 |
+
},
|
12695 |
+
{
|
12696 |
+
"epoch": 0.44,
|
12697 |
+
"grad_norm": 2.825495958328247,
|
12698 |
+
"learning_rate": 8.318644067796611e-06,
|
12699 |
+
"loss": 1.2165,
|
12700 |
+
"step": 17730
|
12701 |
+
},
|
12702 |
+
{
|
12703 |
+
"epoch": 0.44,
|
12704 |
+
"grad_norm": 3.8040249347686768,
|
12705 |
+
"learning_rate": 8.311864406779662e-06,
|
12706 |
+
"loss": 1.3623,
|
12707 |
+
"step": 17740
|
12708 |
+
},
|
12709 |
+
{
|
12710 |
+
"epoch": 0.44,
|
12711 |
+
"grad_norm": 3.4438130855560303,
|
12712 |
+
"learning_rate": 8.305084745762712e-06,
|
12713 |
+
"loss": 1.4255,
|
12714 |
+
"step": 17750
|
12715 |
+
},
|
12716 |
+
{
|
12717 |
+
"epoch": 0.44,
|
12718 |
+
"grad_norm": 2.187945604324341,
|
12719 |
+
"learning_rate": 8.298305084745763e-06,
|
12720 |
+
"loss": 1.2181,
|
12721 |
+
"step": 17760
|
12722 |
+
},
|
12723 |
+
{
|
12724 |
+
"epoch": 0.44,
|
12725 |
+
"grad_norm": 2.0644514560699463,
|
12726 |
+
"learning_rate": 8.291525423728815e-06,
|
12727 |
+
"loss": 1.4669,
|
12728 |
+
"step": 17770
|
12729 |
+
},
|
12730 |
+
{
|
12731 |
+
"epoch": 0.44,
|
12732 |
+
"grad_norm": 4.314215660095215,
|
12733 |
+
"learning_rate": 8.284745762711866e-06,
|
12734 |
+
"loss": 1.3776,
|
12735 |
+
"step": 17780
|
12736 |
+
},
|
12737 |
+
{
|
12738 |
+
"epoch": 0.44,
|
12739 |
+
"grad_norm": 2.948415517807007,
|
12740 |
+
"learning_rate": 8.277966101694916e-06,
|
12741 |
+
"loss": 1.3198,
|
12742 |
+
"step": 17790
|
12743 |
+
},
|
12744 |
+
{
|
12745 |
+
"epoch": 0.45,
|
12746 |
+
"grad_norm": 7.77257776260376,
|
12747 |
+
"learning_rate": 8.271186440677966e-06,
|
12748 |
+
"loss": 1.3826,
|
12749 |
+
"step": 17800
|
12750 |
+
},
|
12751 |
+
{
|
12752 |
+
"epoch": 0.45,
|
12753 |
+
"grad_norm": 2.2085230350494385,
|
12754 |
+
"learning_rate": 8.264406779661017e-06,
|
12755 |
+
"loss": 1.3443,
|
12756 |
+
"step": 17810
|
12757 |
+
},
|
12758 |
+
{
|
12759 |
+
"epoch": 0.45,
|
12760 |
+
"grad_norm": 2.8318214416503906,
|
12761 |
+
"learning_rate": 8.257627118644068e-06,
|
12762 |
+
"loss": 1.3126,
|
12763 |
+
"step": 17820
|
12764 |
+
},
|
12765 |
+
{
|
12766 |
+
"epoch": 0.45,
|
12767 |
+
"grad_norm": 4.544606685638428,
|
12768 |
+
"learning_rate": 8.25084745762712e-06,
|
12769 |
+
"loss": 1.2955,
|
12770 |
+
"step": 17830
|
12771 |
+
},
|
12772 |
+
{
|
12773 |
+
"epoch": 0.45,
|
12774 |
+
"grad_norm": 5.595729351043701,
|
12775 |
+
"learning_rate": 8.24406779661017e-06,
|
12776 |
+
"loss": 1.3814,
|
12777 |
+
"step": 17840
|
12778 |
+
},
|
12779 |
+
{
|
12780 |
+
"epoch": 0.45,
|
12781 |
+
"grad_norm": 6.201912879943848,
|
12782 |
+
"learning_rate": 8.237288135593221e-06,
|
12783 |
+
"loss": 1.4243,
|
12784 |
+
"step": 17850
|
12785 |
+
},
|
12786 |
+
{
|
12787 |
+
"epoch": 0.45,
|
12788 |
+
"grad_norm": 7.697713851928711,
|
12789 |
+
"learning_rate": 8.230508474576272e-06,
|
12790 |
+
"loss": 1.326,
|
12791 |
+
"step": 17860
|
12792 |
+
},
|
12793 |
+
{
|
12794 |
+
"epoch": 0.45,
|
12795 |
+
"grad_norm": 3.7588555812835693,
|
12796 |
+
"learning_rate": 8.223728813559324e-06,
|
12797 |
+
"loss": 1.1265,
|
12798 |
+
"step": 17870
|
12799 |
+
},
|
12800 |
+
{
|
12801 |
+
"epoch": 0.45,
|
12802 |
+
"grad_norm": 8.472000122070312,
|
12803 |
+
"learning_rate": 8.216949152542373e-06,
|
12804 |
+
"loss": 1.2678,
|
12805 |
+
"step": 17880
|
12806 |
+
},
|
12807 |
+
{
|
12808 |
+
"epoch": 0.45,
|
12809 |
+
"grad_norm": 4.802706241607666,
|
12810 |
+
"learning_rate": 8.210169491525425e-06,
|
12811 |
+
"loss": 1.5116,
|
12812 |
+
"step": 17890
|
12813 |
+
},
|
12814 |
+
{
|
12815 |
+
"epoch": 0.45,
|
12816 |
+
"grad_norm": 5.357900142669678,
|
12817 |
+
"learning_rate": 8.203389830508475e-06,
|
12818 |
+
"loss": 1.4435,
|
12819 |
+
"step": 17900
|
12820 |
+
},
|
12821 |
+
{
|
12822 |
+
"epoch": 0.45,
|
12823 |
+
"grad_norm": 3.5331151485443115,
|
12824 |
+
"learning_rate": 8.196610169491526e-06,
|
12825 |
+
"loss": 1.3838,
|
12826 |
+
"step": 17910
|
12827 |
+
},
|
12828 |
+
{
|
12829 |
+
"epoch": 0.45,
|
12830 |
+
"grad_norm": 2.591485023498535,
|
12831 |
+
"learning_rate": 8.189830508474577e-06,
|
12832 |
+
"loss": 1.3595,
|
12833 |
+
"step": 17920
|
12834 |
+
},
|
12835 |
+
{
|
12836 |
+
"epoch": 0.45,
|
12837 |
+
"grad_norm": 9.1065034866333,
|
12838 |
+
"learning_rate": 8.183050847457627e-06,
|
12839 |
+
"loss": 1.4109,
|
12840 |
+
"step": 17930
|
12841 |
+
},
|
12842 |
+
{
|
12843 |
+
"epoch": 0.45,
|
12844 |
+
"grad_norm": 3.2992680072784424,
|
12845 |
+
"learning_rate": 8.176271186440678e-06,
|
12846 |
+
"loss": 1.3554,
|
12847 |
+
"step": 17940
|
12848 |
+
},
|
12849 |
+
{
|
12850 |
+
"epoch": 0.45,
|
12851 |
+
"grad_norm": 10.191650390625,
|
12852 |
+
"learning_rate": 8.16949152542373e-06,
|
12853 |
+
"loss": 1.3561,
|
12854 |
+
"step": 17950
|
12855 |
+
},
|
12856 |
+
{
|
12857 |
+
"epoch": 0.45,
|
12858 |
+
"grad_norm": 5.769218444824219,
|
12859 |
+
"learning_rate": 8.162711864406781e-06,
|
12860 |
+
"loss": 1.4044,
|
12861 |
+
"step": 17960
|
12862 |
+
},
|
12863 |
+
{
|
12864 |
+
"epoch": 0.45,
|
12865 |
+
"grad_norm": 47.62531661987305,
|
12866 |
+
"learning_rate": 8.155932203389831e-06,
|
12867 |
+
"loss": 1.6211,
|
12868 |
+
"step": 17970
|
12869 |
+
},
|
12870 |
+
{
|
12871 |
+
"epoch": 0.45,
|
12872 |
+
"grad_norm": 3.514390230178833,
|
12873 |
+
"learning_rate": 8.149152542372882e-06,
|
12874 |
+
"loss": 1.288,
|
12875 |
+
"step": 17980
|
12876 |
+
},
|
12877 |
+
{
|
12878 |
+
"epoch": 0.45,
|
12879 |
+
"grad_norm": 3.9963278770446777,
|
12880 |
+
"learning_rate": 8.142372881355934e-06,
|
12881 |
+
"loss": 1.3625,
|
12882 |
+
"step": 17990
|
12883 |
+
},
|
12884 |
+
{
|
12885 |
+
"epoch": 0.45,
|
12886 |
+
"grad_norm": 4.2144856452941895,
|
12887 |
+
"learning_rate": 8.135593220338983e-06,
|
12888 |
+
"loss": 1.3154,
|
12889 |
+
"step": 18000
|
12890 |
+
},
|
12891 |
+
{
|
12892 |
+
"epoch": 0.45,
|
12893 |
+
"eval_loss": 1.3663108348846436,
|
12894 |
+
"eval_runtime": 66.1644,
|
12895 |
+
"eval_samples_per_second": 15.114,
|
12896 |
+
"eval_steps_per_second": 15.114,
|
12897 |
+
"step": 18000
|
12898 |
+
},
|
12899 |
+
{
|
12900 |
+
"epoch": 0.45,
|
12901 |
+
"grad_norm": 3.047025680541992,
|
12902 |
+
"learning_rate": 8.128813559322035e-06,
|
12903 |
+
"loss": 1.3131,
|
12904 |
+
"step": 18010
|
12905 |
+
},
|
12906 |
+
{
|
12907 |
+
"epoch": 0.45,
|
12908 |
+
"grad_norm": 5.798911094665527,
|
12909 |
+
"learning_rate": 8.122033898305085e-06,
|
12910 |
+
"loss": 1.4371,
|
12911 |
+
"step": 18020
|
12912 |
+
},
|
12913 |
+
{
|
12914 |
+
"epoch": 0.45,
|
12915 |
+
"grad_norm": 3.8710880279541016,
|
12916 |
+
"learning_rate": 8.115254237288136e-06,
|
12917 |
+
"loss": 1.5147,
|
12918 |
+
"step": 18030
|
12919 |
+
},
|
12920 |
+
{
|
12921 |
+
"epoch": 0.45,
|
12922 |
+
"grad_norm": 6.445448398590088,
|
12923 |
+
"learning_rate": 8.108474576271187e-06,
|
12924 |
+
"loss": 1.3332,
|
12925 |
+
"step": 18040
|
12926 |
+
},
|
12927 |
+
{
|
12928 |
+
"epoch": 0.45,
|
12929 |
+
"grad_norm": 5.0877275466918945,
|
12930 |
+
"learning_rate": 8.101694915254237e-06,
|
12931 |
+
"loss": 1.5076,
|
12932 |
+
"step": 18050
|
12933 |
+
},
|
12934 |
+
{
|
12935 |
+
"epoch": 0.45,
|
12936 |
+
"grad_norm": 6.0062737464904785,
|
12937 |
+
"learning_rate": 8.094915254237289e-06,
|
12938 |
+
"loss": 1.3257,
|
12939 |
+
"step": 18060
|
12940 |
+
},
|
12941 |
+
{
|
12942 |
+
"epoch": 0.45,
|
12943 |
+
"grad_norm": 3.2079477310180664,
|
12944 |
+
"learning_rate": 8.08813559322034e-06,
|
12945 |
+
"loss": 1.2162,
|
12946 |
+
"step": 18070
|
12947 |
+
},
|
12948 |
+
{
|
12949 |
+
"epoch": 0.45,
|
12950 |
+
"grad_norm": 2.974025249481201,
|
12951 |
+
"learning_rate": 8.081355932203391e-06,
|
12952 |
+
"loss": 1.3433,
|
12953 |
+
"step": 18080
|
12954 |
+
},
|
12955 |
+
{
|
12956 |
+
"epoch": 0.45,
|
12957 |
+
"grad_norm": 11.321368217468262,
|
12958 |
+
"learning_rate": 8.074576271186441e-06,
|
12959 |
+
"loss": 1.4845,
|
12960 |
+
"step": 18090
|
12961 |
+
},
|
12962 |
+
{
|
12963 |
+
"epoch": 0.45,
|
12964 |
+
"grad_norm": 3.768314838409424,
|
12965 |
+
"learning_rate": 8.067796610169492e-06,
|
12966 |
+
"loss": 1.3288,
|
12967 |
+
"step": 18100
|
12968 |
+
},
|
12969 |
+
{
|
12970 |
+
"epoch": 0.45,
|
12971 |
+
"grad_norm": 3.9936201572418213,
|
12972 |
+
"learning_rate": 8.061016949152542e-06,
|
12973 |
+
"loss": 1.2507,
|
12974 |
+
"step": 18110
|
12975 |
+
},
|
12976 |
+
{
|
12977 |
+
"epoch": 0.45,
|
12978 |
+
"grad_norm": 3.69484543800354,
|
12979 |
+
"learning_rate": 8.054237288135594e-06,
|
12980 |
+
"loss": 1.2615,
|
12981 |
+
"step": 18120
|
12982 |
+
},
|
12983 |
+
{
|
12984 |
+
"epoch": 0.45,
|
12985 |
+
"grad_norm": 6.352110862731934,
|
12986 |
+
"learning_rate": 8.047457627118645e-06,
|
12987 |
+
"loss": 1.2888,
|
12988 |
+
"step": 18130
|
12989 |
+
},
|
12990 |
+
{
|
12991 |
+
"epoch": 0.45,
|
12992 |
+
"grad_norm": 8.259808540344238,
|
12993 |
+
"learning_rate": 8.040677966101695e-06,
|
12994 |
+
"loss": 1.3588,
|
12995 |
+
"step": 18140
|
12996 |
+
},
|
12997 |
+
{
|
12998 |
+
"epoch": 0.45,
|
12999 |
+
"grad_norm": 8.203502655029297,
|
13000 |
+
"learning_rate": 8.033898305084746e-06,
|
13001 |
+
"loss": 1.2716,
|
13002 |
+
"step": 18150
|
13003 |
+
},
|
13004 |
+
{
|
13005 |
+
"epoch": 0.45,
|
13006 |
+
"grad_norm": 3.8740904331207275,
|
13007 |
+
"learning_rate": 8.027118644067797e-06,
|
13008 |
+
"loss": 1.4191,
|
13009 |
+
"step": 18160
|
13010 |
+
},
|
13011 |
+
{
|
13012 |
+
"epoch": 0.45,
|
13013 |
+
"grad_norm": 3.9104349613189697,
|
13014 |
+
"learning_rate": 8.020338983050849e-06,
|
13015 |
+
"loss": 1.1126,
|
13016 |
+
"step": 18170
|
13017 |
+
},
|
13018 |
+
{
|
13019 |
+
"epoch": 0.45,
|
13020 |
+
"grad_norm": 4.973033428192139,
|
13021 |
+
"learning_rate": 8.013559322033899e-06,
|
13022 |
+
"loss": 1.4034,
|
13023 |
+
"step": 18180
|
13024 |
+
},
|
13025 |
+
{
|
13026 |
+
"epoch": 0.45,
|
13027 |
+
"grad_norm": 5.2367329597473145,
|
13028 |
+
"learning_rate": 8.00677966101695e-06,
|
13029 |
+
"loss": 1.4094,
|
13030 |
+
"step": 18190
|
13031 |
+
},
|
13032 |
+
{
|
13033 |
+
"epoch": 0.46,
|
13034 |
+
"grad_norm": 4.842703819274902,
|
13035 |
+
"learning_rate": 8.000000000000001e-06,
|
13036 |
+
"loss": 1.4423,
|
13037 |
+
"step": 18200
|
13038 |
+
},
|
13039 |
+
{
|
13040 |
+
"epoch": 0.46,
|
13041 |
+
"grad_norm": 6.39046573638916,
|
13042 |
+
"learning_rate": 7.993220338983053e-06,
|
13043 |
+
"loss": 1.3491,
|
13044 |
+
"step": 18210
|
13045 |
+
},
|
13046 |
+
{
|
13047 |
+
"epoch": 0.46,
|
13048 |
+
"grad_norm": 3.424907922744751,
|
13049 |
+
"learning_rate": 7.986440677966102e-06,
|
13050 |
+
"loss": 1.268,
|
13051 |
+
"step": 18220
|
13052 |
+
},
|
13053 |
+
{
|
13054 |
+
"epoch": 0.46,
|
13055 |
+
"grad_norm": 4.363860607147217,
|
13056 |
+
"learning_rate": 7.979661016949152e-06,
|
13057 |
+
"loss": 1.2921,
|
13058 |
+
"step": 18230
|
13059 |
+
},
|
13060 |
+
{
|
13061 |
+
"epoch": 0.46,
|
13062 |
+
"grad_norm": 10.35586929321289,
|
13063 |
+
"learning_rate": 7.972881355932204e-06,
|
13064 |
+
"loss": 1.4456,
|
13065 |
+
"step": 18240
|
13066 |
+
},
|
13067 |
+
{
|
13068 |
+
"epoch": 0.46,
|
13069 |
+
"grad_norm": 11.094548225402832,
|
13070 |
+
"learning_rate": 7.966101694915255e-06,
|
13071 |
+
"loss": 1.5682,
|
13072 |
+
"step": 18250
|
13073 |
+
},
|
13074 |
+
{
|
13075 |
+
"epoch": 0.46,
|
13076 |
+
"grad_norm": 10.801241874694824,
|
13077 |
+
"learning_rate": 7.959322033898306e-06,
|
13078 |
+
"loss": 1.3475,
|
13079 |
+
"step": 18260
|
13080 |
+
},
|
13081 |
+
{
|
13082 |
+
"epoch": 0.46,
|
13083 |
+
"grad_norm": 3.893005132675171,
|
13084 |
+
"learning_rate": 7.952542372881356e-06,
|
13085 |
+
"loss": 1.3858,
|
13086 |
+
"step": 18270
|
13087 |
+
},
|
13088 |
+
{
|
13089 |
+
"epoch": 0.46,
|
13090 |
+
"grad_norm": 8.01429557800293,
|
13091 |
+
"learning_rate": 7.945762711864407e-06,
|
13092 |
+
"loss": 1.3695,
|
13093 |
+
"step": 18280
|
13094 |
+
},
|
13095 |
+
{
|
13096 |
+
"epoch": 0.46,
|
13097 |
+
"grad_norm": 9.507207870483398,
|
13098 |
+
"learning_rate": 7.938983050847459e-06,
|
13099 |
+
"loss": 1.2442,
|
13100 |
+
"step": 18290
|
13101 |
+
},
|
13102 |
+
{
|
13103 |
+
"epoch": 0.46,
|
13104 |
+
"grad_norm": 4.206308364868164,
|
13105 |
+
"learning_rate": 7.93220338983051e-06,
|
13106 |
+
"loss": 1.241,
|
13107 |
+
"step": 18300
|
13108 |
+
},
|
13109 |
+
{
|
13110 |
+
"epoch": 0.46,
|
13111 |
+
"grad_norm": 7.3320794105529785,
|
13112 |
+
"learning_rate": 7.92542372881356e-06,
|
13113 |
+
"loss": 1.3338,
|
13114 |
+
"step": 18310
|
13115 |
+
},
|
13116 |
+
{
|
13117 |
+
"epoch": 0.46,
|
13118 |
+
"grad_norm": 4.259428977966309,
|
13119 |
+
"learning_rate": 7.91864406779661e-06,
|
13120 |
+
"loss": 1.3902,
|
13121 |
+
"step": 18320
|
13122 |
+
},
|
13123 |
+
{
|
13124 |
+
"epoch": 0.46,
|
13125 |
+
"grad_norm": 15.748085975646973,
|
13126 |
+
"learning_rate": 7.911864406779661e-06,
|
13127 |
+
"loss": 1.282,
|
13128 |
+
"step": 18330
|
13129 |
+
},
|
13130 |
+
{
|
13131 |
+
"epoch": 0.46,
|
13132 |
+
"grad_norm": 7.123409748077393,
|
13133 |
+
"learning_rate": 7.905084745762712e-06,
|
13134 |
+
"loss": 1.3311,
|
13135 |
+
"step": 18340
|
13136 |
+
},
|
13137 |
+
{
|
13138 |
+
"epoch": 0.46,
|
13139 |
+
"grad_norm": 7.022697925567627,
|
13140 |
+
"learning_rate": 7.898305084745764e-06,
|
13141 |
+
"loss": 1.292,
|
13142 |
+
"step": 18350
|
13143 |
+
},
|
13144 |
+
{
|
13145 |
+
"epoch": 0.46,
|
13146 |
+
"grad_norm": 3.3042855262756348,
|
13147 |
+
"learning_rate": 7.891525423728814e-06,
|
13148 |
+
"loss": 1.3587,
|
13149 |
+
"step": 18360
|
13150 |
+
},
|
13151 |
+
{
|
13152 |
+
"epoch": 0.46,
|
13153 |
+
"grad_norm": 3.5053913593292236,
|
13154 |
+
"learning_rate": 7.884745762711865e-06,
|
13155 |
+
"loss": 1.5188,
|
13156 |
+
"step": 18370
|
13157 |
+
},
|
13158 |
+
{
|
13159 |
+
"epoch": 0.46,
|
13160 |
+
"grad_norm": 2.6854681968688965,
|
13161 |
+
"learning_rate": 7.877966101694916e-06,
|
13162 |
+
"loss": 1.3633,
|
13163 |
+
"step": 18380
|
13164 |
+
},
|
13165 |
+
{
|
13166 |
+
"epoch": 0.46,
|
13167 |
+
"grad_norm": 3.8879082202911377,
|
13168 |
+
"learning_rate": 7.871186440677968e-06,
|
13169 |
+
"loss": 1.5491,
|
13170 |
+
"step": 18390
|
13171 |
+
},
|
13172 |
+
{
|
13173 |
+
"epoch": 0.46,
|
13174 |
+
"grad_norm": 9.43989086151123,
|
13175 |
+
"learning_rate": 7.864406779661017e-06,
|
13176 |
+
"loss": 1.2686,
|
13177 |
+
"step": 18400
|
13178 |
+
},
|
13179 |
+
{
|
13180 |
+
"epoch": 0.46,
|
13181 |
+
"grad_norm": 4.684770107269287,
|
13182 |
+
"learning_rate": 7.857627118644069e-06,
|
13183 |
+
"loss": 1.2463,
|
13184 |
+
"step": 18410
|
13185 |
+
},
|
13186 |
+
{
|
13187 |
+
"epoch": 0.46,
|
13188 |
+
"grad_norm": 2.699023485183716,
|
13189 |
+
"learning_rate": 7.850847457627119e-06,
|
13190 |
+
"loss": 1.5216,
|
13191 |
+
"step": 18420
|
13192 |
+
},
|
13193 |
+
{
|
13194 |
+
"epoch": 0.46,
|
13195 |
+
"grad_norm": 3.1254053115844727,
|
13196 |
+
"learning_rate": 7.84406779661017e-06,
|
13197 |
+
"loss": 1.4231,
|
13198 |
+
"step": 18430
|
13199 |
+
},
|
13200 |
+
{
|
13201 |
+
"epoch": 0.46,
|
13202 |
+
"grad_norm": 3.4634742736816406,
|
13203 |
+
"learning_rate": 7.837288135593221e-06,
|
13204 |
+
"loss": 1.4174,
|
13205 |
+
"step": 18440
|
13206 |
+
},
|
13207 |
+
{
|
13208 |
+
"epoch": 0.46,
|
13209 |
+
"grad_norm": 2.7554078102111816,
|
13210 |
+
"learning_rate": 7.830508474576271e-06,
|
13211 |
+
"loss": 1.2646,
|
13212 |
+
"step": 18450
|
13213 |
+
},
|
13214 |
+
{
|
13215 |
+
"epoch": 0.46,
|
13216 |
+
"grad_norm": 7.782464504241943,
|
13217 |
+
"learning_rate": 7.823728813559322e-06,
|
13218 |
+
"loss": 1.273,
|
13219 |
+
"step": 18460
|
13220 |
+
},
|
13221 |
+
{
|
13222 |
+
"epoch": 0.46,
|
13223 |
+
"grad_norm": 12.717724800109863,
|
13224 |
+
"learning_rate": 7.816949152542374e-06,
|
13225 |
+
"loss": 1.3536,
|
13226 |
+
"step": 18470
|
13227 |
+
},
|
13228 |
+
{
|
13229 |
+
"epoch": 0.46,
|
13230 |
+
"grad_norm": 4.644845008850098,
|
13231 |
+
"learning_rate": 7.810169491525425e-06,
|
13232 |
+
"loss": 1.567,
|
13233 |
+
"step": 18480
|
13234 |
+
},
|
13235 |
+
{
|
13236 |
+
"epoch": 0.46,
|
13237 |
+
"grad_norm": 2.032013177871704,
|
13238 |
+
"learning_rate": 7.803389830508475e-06,
|
13239 |
+
"loss": 1.4428,
|
13240 |
+
"step": 18490
|
13241 |
+
},
|
13242 |
+
{
|
13243 |
+
"epoch": 0.46,
|
13244 |
+
"grad_norm": 8.91115951538086,
|
13245 |
+
"learning_rate": 7.796610169491526e-06,
|
13246 |
+
"loss": 1.4071,
|
13247 |
+
"step": 18500
|
13248 |
+
},
|
13249 |
+
{
|
13250 |
+
"epoch": 0.46,
|
13251 |
+
"eval_loss": 1.3208402395248413,
|
13252 |
+
"eval_runtime": 66.1773,
|
13253 |
+
"eval_samples_per_second": 15.111,
|
13254 |
+
"eval_steps_per_second": 15.111,
|
13255 |
+
"step": 18500
|
13256 |
+
},
|
13257 |
+
{
|
13258 |
+
"epoch": 0.46,
|
13259 |
+
"grad_norm": 4.1798248291015625,
|
13260 |
+
"learning_rate": 7.789830508474578e-06,
|
13261 |
+
"loss": 1.3752,
|
13262 |
+
"step": 18510
|
13263 |
+
},
|
13264 |
+
{
|
13265 |
+
"epoch": 0.46,
|
13266 |
+
"grad_norm": 3.6975016593933105,
|
13267 |
+
"learning_rate": 7.783050847457628e-06,
|
13268 |
+
"loss": 1.3685,
|
13269 |
+
"step": 18520
|
13270 |
+
},
|
13271 |
+
{
|
13272 |
+
"epoch": 0.46,
|
13273 |
+
"grad_norm": 4.421247959136963,
|
13274 |
+
"learning_rate": 7.776271186440679e-06,
|
13275 |
+
"loss": 1.5264,
|
13276 |
+
"step": 18530
|
13277 |
+
},
|
13278 |
+
{
|
13279 |
+
"epoch": 0.46,
|
13280 |
+
"grad_norm": 8.739845275878906,
|
13281 |
+
"learning_rate": 7.769491525423729e-06,
|
13282 |
+
"loss": 1.4609,
|
13283 |
+
"step": 18540
|
13284 |
+
},
|
13285 |
+
{
|
13286 |
+
"epoch": 0.46,
|
13287 |
+
"grad_norm": 3.287424325942993,
|
13288 |
+
"learning_rate": 7.76271186440678e-06,
|
13289 |
+
"loss": 1.1154,
|
13290 |
+
"step": 18550
|
13291 |
+
},
|
13292 |
+
{
|
13293 |
+
"epoch": 0.46,
|
13294 |
+
"grad_norm": 3.1192266941070557,
|
13295 |
+
"learning_rate": 7.755932203389831e-06,
|
13296 |
+
"loss": 1.2787,
|
13297 |
+
"step": 18560
|
13298 |
+
},
|
13299 |
+
{
|
13300 |
+
"epoch": 0.46,
|
13301 |
+
"grad_norm": 8.368815422058105,
|
13302 |
+
"learning_rate": 7.749152542372881e-06,
|
13303 |
+
"loss": 1.3339,
|
13304 |
+
"step": 18570
|
13305 |
+
},
|
13306 |
+
{
|
13307 |
+
"epoch": 0.46,
|
13308 |
+
"grad_norm": 6.241825580596924,
|
13309 |
+
"learning_rate": 7.742372881355933e-06,
|
13310 |
+
"loss": 1.2293,
|
13311 |
+
"step": 18580
|
13312 |
+
},
|
13313 |
+
{
|
13314 |
+
"epoch": 0.46,
|
13315 |
+
"grad_norm": 9.267044067382812,
|
13316 |
+
"learning_rate": 7.735593220338984e-06,
|
13317 |
+
"loss": 1.4354,
|
13318 |
+
"step": 18590
|
13319 |
+
},
|
13320 |
+
{
|
13321 |
+
"epoch": 0.47,
|
13322 |
+
"grad_norm": 5.569955348968506,
|
13323 |
+
"learning_rate": 7.728813559322035e-06,
|
13324 |
+
"loss": 1.3941,
|
13325 |
+
"step": 18600
|
13326 |
+
},
|
13327 |
+
{
|
13328 |
+
"epoch": 0.47,
|
13329 |
+
"grad_norm": 11.273942947387695,
|
13330 |
+
"learning_rate": 7.722033898305085e-06,
|
13331 |
+
"loss": 1.1127,
|
13332 |
+
"step": 18610
|
13333 |
+
},
|
13334 |
+
{
|
13335 |
+
"epoch": 0.47,
|
13336 |
+
"grad_norm": 3.7952206134796143,
|
13337 |
+
"learning_rate": 7.715254237288136e-06,
|
13338 |
+
"loss": 1.3437,
|
13339 |
+
"step": 18620
|
13340 |
+
},
|
13341 |
+
{
|
13342 |
+
"epoch": 0.47,
|
13343 |
+
"grad_norm": 2.9084484577178955,
|
13344 |
+
"learning_rate": 7.708474576271186e-06,
|
13345 |
+
"loss": 1.3646,
|
13346 |
+
"step": 18630
|
13347 |
+
},
|
13348 |
+
{
|
13349 |
+
"epoch": 0.47,
|
13350 |
+
"grad_norm": 4.15964937210083,
|
13351 |
+
"learning_rate": 7.701694915254238e-06,
|
13352 |
+
"loss": 1.4666,
|
13353 |
+
"step": 18640
|
13354 |
+
},
|
13355 |
+
{
|
13356 |
+
"epoch": 0.47,
|
13357 |
+
"grad_norm": 4.938930988311768,
|
13358 |
+
"learning_rate": 7.694915254237289e-06,
|
13359 |
+
"loss": 1.4065,
|
13360 |
+
"step": 18650
|
13361 |
+
},
|
13362 |
+
{
|
13363 |
+
"epoch": 0.47,
|
13364 |
+
"grad_norm": 8.298666954040527,
|
13365 |
+
"learning_rate": 7.688135593220339e-06,
|
13366 |
+
"loss": 1.354,
|
13367 |
+
"step": 18660
|
13368 |
+
},
|
13369 |
+
{
|
13370 |
+
"epoch": 0.47,
|
13371 |
+
"grad_norm": 3.1837921142578125,
|
13372 |
+
"learning_rate": 7.68135593220339e-06,
|
13373 |
+
"loss": 1.3879,
|
13374 |
+
"step": 18670
|
13375 |
+
},
|
13376 |
+
{
|
13377 |
+
"epoch": 0.47,
|
13378 |
+
"grad_norm": 2.978053331375122,
|
13379 |
+
"learning_rate": 7.674576271186441e-06,
|
13380 |
+
"loss": 1.3765,
|
13381 |
+
"step": 18680
|
13382 |
+
},
|
13383 |
+
{
|
13384 |
+
"epoch": 0.47,
|
13385 |
+
"grad_norm": 5.5602827072143555,
|
13386 |
+
"learning_rate": 7.667796610169493e-06,
|
13387 |
+
"loss": 1.2135,
|
13388 |
+
"step": 18690
|
13389 |
+
},
|
13390 |
+
{
|
13391 |
+
"epoch": 0.47,
|
13392 |
+
"grad_norm": 5.481113910675049,
|
13393 |
+
"learning_rate": 7.661016949152543e-06,
|
13394 |
+
"loss": 1.3841,
|
13395 |
+
"step": 18700
|
13396 |
+
},
|
13397 |
+
{
|
13398 |
+
"epoch": 0.47,
|
13399 |
+
"grad_norm": 9.419681549072266,
|
13400 |
+
"learning_rate": 7.654237288135594e-06,
|
13401 |
+
"loss": 1.2349,
|
13402 |
+
"step": 18710
|
13403 |
+
},
|
13404 |
+
{
|
13405 |
+
"epoch": 0.47,
|
13406 |
+
"grad_norm": 4.955466270446777,
|
13407 |
+
"learning_rate": 7.647457627118645e-06,
|
13408 |
+
"loss": 1.4208,
|
13409 |
+
"step": 18720
|
13410 |
+
},
|
13411 |
+
{
|
13412 |
+
"epoch": 0.47,
|
13413 |
+
"grad_norm": 5.841220855712891,
|
13414 |
+
"learning_rate": 7.640677966101695e-06,
|
13415 |
+
"loss": 1.4526,
|
13416 |
+
"step": 18730
|
13417 |
+
},
|
13418 |
+
{
|
13419 |
+
"epoch": 0.47,
|
13420 |
+
"grad_norm": 3.1412012577056885,
|
13421 |
+
"learning_rate": 7.633898305084746e-06,
|
13422 |
+
"loss": 1.4606,
|
13423 |
+
"step": 18740
|
13424 |
+
},
|
13425 |
+
{
|
13426 |
+
"epoch": 0.47,
|
13427 |
+
"grad_norm": 4.624940395355225,
|
13428 |
+
"learning_rate": 7.627118644067797e-06,
|
13429 |
+
"loss": 1.3875,
|
13430 |
+
"step": 18750
|
13431 |
+
},
|
13432 |
+
{
|
13433 |
+
"epoch": 0.47,
|
13434 |
+
"grad_norm": 9.828381538391113,
|
13435 |
+
"learning_rate": 7.6203389830508476e-06,
|
13436 |
+
"loss": 1.4079,
|
13437 |
+
"step": 18760
|
13438 |
+
},
|
13439 |
+
{
|
13440 |
+
"epoch": 0.47,
|
13441 |
+
"grad_norm": 5.299017429351807,
|
13442 |
+
"learning_rate": 7.613559322033899e-06,
|
13443 |
+
"loss": 1.278,
|
13444 |
+
"step": 18770
|
13445 |
+
},
|
13446 |
+
{
|
13447 |
+
"epoch": 0.47,
|
13448 |
+
"grad_norm": 6.449117183685303,
|
13449 |
+
"learning_rate": 7.6067796610169495e-06,
|
13450 |
+
"loss": 1.4141,
|
13451 |
+
"step": 18780
|
13452 |
+
},
|
13453 |
+
{
|
13454 |
+
"epoch": 0.47,
|
13455 |
+
"grad_norm": 6.961145401000977,
|
13456 |
+
"learning_rate": 7.600000000000001e-06,
|
13457 |
+
"loss": 1.2828,
|
13458 |
+
"step": 18790
|
13459 |
+
},
|
13460 |
+
{
|
13461 |
+
"epoch": 0.47,
|
13462 |
+
"grad_norm": 8.588834762573242,
|
13463 |
+
"learning_rate": 7.5932203389830515e-06,
|
13464 |
+
"loss": 1.3544,
|
13465 |
+
"step": 18800
|
13466 |
+
},
|
13467 |
+
{
|
13468 |
+
"epoch": 0.47,
|
13469 |
+
"grad_norm": 2.5683794021606445,
|
13470 |
+
"learning_rate": 7.586440677966103e-06,
|
13471 |
+
"loss": 1.4127,
|
13472 |
+
"step": 18810
|
13473 |
+
},
|
13474 |
+
{
|
13475 |
+
"epoch": 0.47,
|
13476 |
+
"grad_norm": 2.101924180984497,
|
13477 |
+
"learning_rate": 7.5796610169491534e-06,
|
13478 |
+
"loss": 1.229,
|
13479 |
+
"step": 18820
|
13480 |
+
},
|
13481 |
+
{
|
13482 |
+
"epoch": 0.47,
|
13483 |
+
"grad_norm": 5.389444351196289,
|
13484 |
+
"learning_rate": 7.572881355932205e-06,
|
13485 |
+
"loss": 1.3112,
|
13486 |
+
"step": 18830
|
13487 |
+
},
|
13488 |
+
{
|
13489 |
+
"epoch": 0.47,
|
13490 |
+
"grad_norm": 3.154507875442505,
|
13491 |
+
"learning_rate": 7.5661016949152545e-06,
|
13492 |
+
"loss": 1.2603,
|
13493 |
+
"step": 18840
|
13494 |
+
},
|
13495 |
+
{
|
13496 |
+
"epoch": 0.47,
|
13497 |
+
"grad_norm": 4.938506126403809,
|
13498 |
+
"learning_rate": 7.559322033898305e-06,
|
13499 |
+
"loss": 1.5423,
|
13500 |
+
"step": 18850
|
13501 |
+
},
|
13502 |
+
{
|
13503 |
+
"epoch": 0.47,
|
13504 |
+
"grad_norm": 5.838810443878174,
|
13505 |
+
"learning_rate": 7.5525423728813565e-06,
|
13506 |
+
"loss": 1.0651,
|
13507 |
+
"step": 18860
|
13508 |
+
},
|
13509 |
+
{
|
13510 |
+
"epoch": 0.47,
|
13511 |
+
"grad_norm": 7.946393013000488,
|
13512 |
+
"learning_rate": 7.545762711864407e-06,
|
13513 |
+
"loss": 1.4706,
|
13514 |
+
"step": 18870
|
13515 |
+
},
|
13516 |
+
{
|
13517 |
+
"epoch": 0.47,
|
13518 |
+
"grad_norm": 12.6395902633667,
|
13519 |
+
"learning_rate": 7.5389830508474584e-06,
|
13520 |
+
"loss": 1.4036,
|
13521 |
+
"step": 18880
|
13522 |
+
},
|
13523 |
+
{
|
13524 |
+
"epoch": 0.47,
|
13525 |
+
"grad_norm": 4.769916534423828,
|
13526 |
+
"learning_rate": 7.532203389830509e-06,
|
13527 |
+
"loss": 1.33,
|
13528 |
+
"step": 18890
|
13529 |
+
},
|
13530 |
+
{
|
13531 |
+
"epoch": 0.47,
|
13532 |
+
"grad_norm": 5.928445816040039,
|
13533 |
+
"learning_rate": 7.52542372881356e-06,
|
13534 |
+
"loss": 1.4844,
|
13535 |
+
"step": 18900
|
13536 |
+
},
|
13537 |
+
{
|
13538 |
+
"epoch": 0.47,
|
13539 |
+
"grad_norm": 4.121998310089111,
|
13540 |
+
"learning_rate": 7.518644067796611e-06,
|
13541 |
+
"loss": 1.2672,
|
13542 |
+
"step": 18910
|
13543 |
+
},
|
13544 |
+
{
|
13545 |
+
"epoch": 0.47,
|
13546 |
+
"grad_norm": 1.256914496421814,
|
13547 |
+
"learning_rate": 7.511864406779662e-06,
|
13548 |
+
"loss": 1.337,
|
13549 |
+
"step": 18920
|
13550 |
+
},
|
13551 |
+
{
|
13552 |
+
"epoch": 0.47,
|
13553 |
+
"grad_norm": 9.951703071594238,
|
13554 |
+
"learning_rate": 7.505084745762713e-06,
|
13555 |
+
"loss": 1.3359,
|
13556 |
+
"step": 18930
|
13557 |
+
},
|
13558 |
+
{
|
13559 |
+
"epoch": 0.47,
|
13560 |
+
"grad_norm": 5.373715400695801,
|
13561 |
+
"learning_rate": 7.498305084745763e-06,
|
13562 |
+
"loss": 1.2437,
|
13563 |
+
"step": 18940
|
13564 |
+
},
|
13565 |
+
{
|
13566 |
+
"epoch": 0.47,
|
13567 |
+
"grad_norm": 7.120504379272461,
|
13568 |
+
"learning_rate": 7.491525423728814e-06,
|
13569 |
+
"loss": 1.1883,
|
13570 |
+
"step": 18950
|
13571 |
+
},
|
13572 |
+
{
|
13573 |
+
"epoch": 0.47,
|
13574 |
+
"grad_norm": 7.661159992218018,
|
13575 |
+
"learning_rate": 7.4847457627118646e-06,
|
13576 |
+
"loss": 1.2648,
|
13577 |
+
"step": 18960
|
13578 |
+
},
|
13579 |
+
{
|
13580 |
+
"epoch": 0.47,
|
13581 |
+
"grad_norm": 5.859286308288574,
|
13582 |
+
"learning_rate": 7.477966101694916e-06,
|
13583 |
+
"loss": 1.3106,
|
13584 |
+
"step": 18970
|
13585 |
+
},
|
13586 |
+
{
|
13587 |
+
"epoch": 0.47,
|
13588 |
+
"grad_norm": 7.276169300079346,
|
13589 |
+
"learning_rate": 7.4711864406779665e-06,
|
13590 |
+
"loss": 1.393,
|
13591 |
+
"step": 18980
|
13592 |
+
},
|
13593 |
+
{
|
13594 |
+
"epoch": 0.47,
|
13595 |
+
"grad_norm": 6.31447172164917,
|
13596 |
+
"learning_rate": 7.464406779661018e-06,
|
13597 |
+
"loss": 1.4051,
|
13598 |
+
"step": 18990
|
13599 |
+
},
|
13600 |
+
{
|
13601 |
+
"epoch": 0.47,
|
13602 |
+
"grad_norm": 8.36728572845459,
|
13603 |
+
"learning_rate": 7.4576271186440685e-06,
|
13604 |
+
"loss": 1.4003,
|
13605 |
+
"step": 19000
|
13606 |
+
},
|
13607 |
+
{
|
13608 |
+
"epoch": 0.47,
|
13609 |
+
"eval_loss": 1.347457766532898,
|
13610 |
+
"eval_runtime": 66.1461,
|
13611 |
+
"eval_samples_per_second": 15.118,
|
13612 |
+
"eval_steps_per_second": 15.118,
|
13613 |
+
"step": 19000
|
13614 |
+
},
|
13615 |
+
{
|
13616 |
+
"epoch": 0.48,
|
13617 |
+
"grad_norm": 3.911505937576294,
|
13618 |
+
"learning_rate": 7.45084745762712e-06,
|
13619 |
+
"loss": 1.2646,
|
13620 |
+
"step": 19010
|
13621 |
+
},
|
13622 |
+
{
|
13623 |
+
"epoch": 0.48,
|
13624 |
+
"grad_norm": 3.6765291690826416,
|
13625 |
+
"learning_rate": 7.4440677966101704e-06,
|
13626 |
+
"loss": 1.3827,
|
13627 |
+
"step": 19020
|
13628 |
+
},
|
13629 |
+
{
|
13630 |
+
"epoch": 0.48,
|
13631 |
+
"grad_norm": 3.899599313735962,
|
13632 |
+
"learning_rate": 7.437288135593221e-06,
|
13633 |
+
"loss": 1.4144,
|
13634 |
+
"step": 19030
|
13635 |
+
},
|
13636 |
+
{
|
13637 |
+
"epoch": 0.48,
|
13638 |
+
"grad_norm": 6.436791896820068,
|
13639 |
+
"learning_rate": 7.430508474576272e-06,
|
13640 |
+
"loss": 1.2087,
|
13641 |
+
"step": 19040
|
13642 |
+
},
|
13643 |
+
{
|
13644 |
+
"epoch": 0.48,
|
13645 |
+
"grad_norm": 4.624211311340332,
|
13646 |
+
"learning_rate": 7.423728813559322e-06,
|
13647 |
+
"loss": 1.3824,
|
13648 |
+
"step": 19050
|
13649 |
+
},
|
13650 |
+
{
|
13651 |
+
"epoch": 0.48,
|
13652 |
+
"grad_norm": 6.657593727111816,
|
13653 |
+
"learning_rate": 7.4169491525423735e-06,
|
13654 |
+
"loss": 1.3856,
|
13655 |
+
"step": 19060
|
13656 |
+
},
|
13657 |
+
{
|
13658 |
+
"epoch": 0.48,
|
13659 |
+
"grad_norm": 7.132912635803223,
|
13660 |
+
"learning_rate": 7.410169491525424e-06,
|
13661 |
+
"loss": 1.4319,
|
13662 |
+
"step": 19070
|
13663 |
+
},
|
13664 |
+
{
|
13665 |
+
"epoch": 0.48,
|
13666 |
+
"grad_norm": 2.8681843280792236,
|
13667 |
+
"learning_rate": 7.4033898305084754e-06,
|
13668 |
+
"loss": 1.419,
|
13669 |
+
"step": 19080
|
13670 |
+
},
|
13671 |
+
{
|
13672 |
+
"epoch": 0.48,
|
13673 |
+
"grad_norm": 5.819919109344482,
|
13674 |
+
"learning_rate": 7.396610169491526e-06,
|
13675 |
+
"loss": 1.3988,
|
13676 |
+
"step": 19090
|
13677 |
+
},
|
13678 |
+
{
|
13679 |
+
"epoch": 0.48,
|
13680 |
+
"grad_norm": 6.482515811920166,
|
13681 |
+
"learning_rate": 7.3898305084745766e-06,
|
13682 |
+
"loss": 1.3366,
|
13683 |
+
"step": 19100
|
13684 |
+
},
|
13685 |
+
{
|
13686 |
+
"epoch": 0.48,
|
13687 |
+
"grad_norm": 10.341208457946777,
|
13688 |
+
"learning_rate": 7.383050847457628e-06,
|
13689 |
+
"loss": 1.3502,
|
13690 |
+
"step": 19110
|
13691 |
+
},
|
13692 |
+
{
|
13693 |
+
"epoch": 0.48,
|
13694 |
+
"grad_norm": 2.820133686065674,
|
13695 |
+
"learning_rate": 7.3762711864406785e-06,
|
13696 |
+
"loss": 1.415,
|
13697 |
+
"step": 19120
|
13698 |
+
},
|
13699 |
+
{
|
13700 |
+
"epoch": 0.48,
|
13701 |
+
"grad_norm": 7.112204551696777,
|
13702 |
+
"learning_rate": 7.36949152542373e-06,
|
13703 |
+
"loss": 1.2609,
|
13704 |
+
"step": 19130
|
13705 |
+
},
|
13706 |
+
{
|
13707 |
+
"epoch": 0.48,
|
13708 |
+
"grad_norm": 6.172835826873779,
|
13709 |
+
"learning_rate": 7.3627118644067805e-06,
|
13710 |
+
"loss": 1.2456,
|
13711 |
+
"step": 19140
|
13712 |
+
},
|
13713 |
+
{
|
13714 |
+
"epoch": 0.48,
|
13715 |
+
"grad_norm": 2.9154605865478516,
|
13716 |
+
"learning_rate": 7.355932203389831e-06,
|
13717 |
+
"loss": 1.3332,
|
13718 |
+
"step": 19150
|
13719 |
+
},
|
13720 |
+
{
|
13721 |
+
"epoch": 0.48,
|
13722 |
+
"grad_norm": 6.8542256355285645,
|
13723 |
+
"learning_rate": 7.3491525423728816e-06,
|
13724 |
+
"loss": 1.2728,
|
13725 |
+
"step": 19160
|
13726 |
+
},
|
13727 |
+
{
|
13728 |
+
"epoch": 0.48,
|
13729 |
+
"grad_norm": 7.636801719665527,
|
13730 |
+
"learning_rate": 7.342372881355932e-06,
|
13731 |
+
"loss": 1.3744,
|
13732 |
+
"step": 19170
|
13733 |
+
},
|
13734 |
+
{
|
13735 |
+
"epoch": 0.48,
|
13736 |
+
"grad_norm": 7.556679725646973,
|
13737 |
+
"learning_rate": 7.3355932203389835e-06,
|
13738 |
+
"loss": 1.446,
|
13739 |
+
"step": 19180
|
13740 |
+
},
|
13741 |
+
{
|
13742 |
+
"epoch": 0.48,
|
13743 |
+
"grad_norm": 22.831199645996094,
|
13744 |
+
"learning_rate": 7.328813559322034e-06,
|
13745 |
+
"loss": 1.358,
|
13746 |
+
"step": 19190
|
13747 |
+
},
|
13748 |
+
{
|
13749 |
+
"epoch": 0.48,
|
13750 |
+
"grad_norm": 10.184800148010254,
|
13751 |
+
"learning_rate": 7.3220338983050855e-06,
|
13752 |
+
"loss": 1.2379,
|
13753 |
+
"step": 19200
|
13754 |
+
},
|
13755 |
+
{
|
13756 |
+
"epoch": 0.48,
|
13757 |
+
"grad_norm": 1.9062511920928955,
|
13758 |
+
"learning_rate": 7.315254237288136e-06,
|
13759 |
+
"loss": 1.3724,
|
13760 |
+
"step": 19210
|
13761 |
+
},
|
13762 |
+
{
|
13763 |
+
"epoch": 0.48,
|
13764 |
+
"grad_norm": 9.429797172546387,
|
13765 |
+
"learning_rate": 7.3084745762711874e-06,
|
13766 |
+
"loss": 1.33,
|
13767 |
+
"step": 19220
|
13768 |
+
},
|
13769 |
+
{
|
13770 |
+
"epoch": 0.48,
|
13771 |
+
"grad_norm": 3.129505157470703,
|
13772 |
+
"learning_rate": 7.301694915254238e-06,
|
13773 |
+
"loss": 1.1795,
|
13774 |
+
"step": 19230
|
13775 |
+
},
|
13776 |
+
{
|
13777 |
+
"epoch": 0.48,
|
13778 |
+
"grad_norm": 6.839028835296631,
|
13779 |
+
"learning_rate": 7.294915254237289e-06,
|
13780 |
+
"loss": 1.2983,
|
13781 |
+
"step": 19240
|
13782 |
+
},
|
13783 |
+
{
|
13784 |
+
"epoch": 0.48,
|
13785 |
+
"grad_norm": 15.552626609802246,
|
13786 |
+
"learning_rate": 7.288135593220339e-06,
|
13787 |
+
"loss": 1.4867,
|
13788 |
+
"step": 19250
|
13789 |
+
},
|
13790 |
+
{
|
13791 |
+
"epoch": 0.48,
|
13792 |
+
"grad_norm": 3.4032084941864014,
|
13793 |
+
"learning_rate": 7.28135593220339e-06,
|
13794 |
+
"loss": 1.4256,
|
13795 |
+
"step": 19260
|
13796 |
+
},
|
13797 |
+
{
|
13798 |
+
"epoch": 0.48,
|
13799 |
+
"grad_norm": 6.437047481536865,
|
13800 |
+
"learning_rate": 7.274576271186441e-06,
|
13801 |
+
"loss": 1.3215,
|
13802 |
+
"step": 19270
|
13803 |
+
},
|
13804 |
+
{
|
13805 |
+
"epoch": 0.48,
|
13806 |
+
"grad_norm": 5.592834949493408,
|
13807 |
+
"learning_rate": 7.267796610169492e-06,
|
13808 |
+
"loss": 1.3466,
|
13809 |
+
"step": 19280
|
13810 |
+
},
|
13811 |
+
{
|
13812 |
+
"epoch": 0.48,
|
13813 |
+
"grad_norm": 10.394824028015137,
|
13814 |
+
"learning_rate": 7.261016949152543e-06,
|
13815 |
+
"loss": 1.3806,
|
13816 |
+
"step": 19290
|
13817 |
+
},
|
13818 |
+
{
|
13819 |
+
"epoch": 0.48,
|
13820 |
+
"grad_norm": 6.921483039855957,
|
13821 |
+
"learning_rate": 7.2542372881355936e-06,
|
13822 |
+
"loss": 1.5624,
|
13823 |
+
"step": 19300
|
13824 |
+
},
|
13825 |
+
{
|
13826 |
+
"epoch": 0.48,
|
13827 |
+
"grad_norm": 3.078634262084961,
|
13828 |
+
"learning_rate": 7.247457627118645e-06,
|
13829 |
+
"loss": 1.1216,
|
13830 |
+
"step": 19310
|
13831 |
+
},
|
13832 |
+
{
|
13833 |
+
"epoch": 0.48,
|
13834 |
+
"grad_norm": 6.003988742828369,
|
13835 |
+
"learning_rate": 7.2406779661016955e-06,
|
13836 |
+
"loss": 1.2759,
|
13837 |
+
"step": 19320
|
13838 |
+
},
|
13839 |
+
{
|
13840 |
+
"epoch": 0.48,
|
13841 |
+
"grad_norm": 2.419410467147827,
|
13842 |
+
"learning_rate": 7.233898305084747e-06,
|
13843 |
+
"loss": 1.2067,
|
13844 |
+
"step": 19330
|
13845 |
+
},
|
13846 |
+
{
|
13847 |
+
"epoch": 0.48,
|
13848 |
+
"grad_norm": 3.014220952987671,
|
13849 |
+
"learning_rate": 7.2271186440677975e-06,
|
13850 |
+
"loss": 1.1819,
|
13851 |
+
"step": 19340
|
13852 |
+
},
|
13853 |
+
{
|
13854 |
+
"epoch": 0.48,
|
13855 |
+
"grad_norm": 5.271297454833984,
|
13856 |
+
"learning_rate": 7.220338983050849e-06,
|
13857 |
+
"loss": 1.4742,
|
13858 |
+
"step": 19350
|
13859 |
+
},
|
13860 |
+
{
|
13861 |
+
"epoch": 0.48,
|
13862 |
+
"grad_norm": 2.456838607788086,
|
13863 |
+
"learning_rate": 7.2135593220338986e-06,
|
13864 |
+
"loss": 1.4656,
|
13865 |
+
"step": 19360
|
13866 |
+
},
|
13867 |
+
{
|
13868 |
+
"epoch": 0.48,
|
13869 |
+
"grad_norm": 5.350526332855225,
|
13870 |
+
"learning_rate": 7.206779661016949e-06,
|
13871 |
+
"loss": 1.1238,
|
13872 |
+
"step": 19370
|
13873 |
+
},
|
13874 |
+
{
|
13875 |
+
"epoch": 0.48,
|
13876 |
+
"grad_norm": 5.42751932144165,
|
13877 |
+
"learning_rate": 7.2000000000000005e-06,
|
13878 |
+
"loss": 1.4704,
|
13879 |
+
"step": 19380
|
13880 |
+
},
|
13881 |
+
{
|
13882 |
+
"epoch": 0.48,
|
13883 |
+
"grad_norm": 3.97116756439209,
|
13884 |
+
"learning_rate": 7.193220338983051e-06,
|
13885 |
+
"loss": 1.4455,
|
13886 |
+
"step": 19390
|
13887 |
+
},
|
13888 |
+
{
|
13889 |
+
"epoch": 0.48,
|
13890 |
+
"grad_norm": 5.157166481018066,
|
13891 |
+
"learning_rate": 7.1864406779661025e-06,
|
13892 |
+
"loss": 1.3725,
|
13893 |
+
"step": 19400
|
13894 |
+
},
|
13895 |
+
{
|
13896 |
+
"epoch": 0.49,
|
13897 |
+
"grad_norm": 4.2230072021484375,
|
13898 |
+
"learning_rate": 7.179661016949153e-06,
|
13899 |
+
"loss": 1.3768,
|
13900 |
+
"step": 19410
|
13901 |
+
},
|
13902 |
+
{
|
13903 |
+
"epoch": 0.49,
|
13904 |
+
"grad_norm": 5.513181686401367,
|
13905 |
+
"learning_rate": 7.1728813559322044e-06,
|
13906 |
+
"loss": 1.3018,
|
13907 |
+
"step": 19420
|
13908 |
+
},
|
13909 |
+
{
|
13910 |
+
"epoch": 0.49,
|
13911 |
+
"grad_norm": 7.353570461273193,
|
13912 |
+
"learning_rate": 7.166101694915255e-06,
|
13913 |
+
"loss": 1.513,
|
13914 |
+
"step": 19430
|
13915 |
+
},
|
13916 |
+
{
|
13917 |
+
"epoch": 0.49,
|
13918 |
+
"grad_norm": 6.263894081115723,
|
13919 |
+
"learning_rate": 7.159322033898306e-06,
|
13920 |
+
"loss": 1.4521,
|
13921 |
+
"step": 19440
|
13922 |
+
},
|
13923 |
+
{
|
13924 |
+
"epoch": 0.49,
|
13925 |
+
"grad_norm": 13.368189811706543,
|
13926 |
+
"learning_rate": 7.152542372881357e-06,
|
13927 |
+
"loss": 1.3158,
|
13928 |
+
"step": 19450
|
13929 |
+
},
|
13930 |
+
{
|
13931 |
+
"epoch": 0.49,
|
13932 |
+
"grad_norm": 4.644199371337891,
|
13933 |
+
"learning_rate": 7.145762711864407e-06,
|
13934 |
+
"loss": 1.4184,
|
13935 |
+
"step": 19460
|
13936 |
+
},
|
13937 |
+
{
|
13938 |
+
"epoch": 0.49,
|
13939 |
+
"grad_norm": 5.206189155578613,
|
13940 |
+
"learning_rate": 7.138983050847458e-06,
|
13941 |
+
"loss": 1.3848,
|
13942 |
+
"step": 19470
|
13943 |
+
},
|
13944 |
+
{
|
13945 |
+
"epoch": 0.49,
|
13946 |
+
"grad_norm": 7.3628153800964355,
|
13947 |
+
"learning_rate": 7.132203389830509e-06,
|
13948 |
+
"loss": 1.2512,
|
13949 |
+
"step": 19480
|
13950 |
+
},
|
13951 |
+
{
|
13952 |
+
"epoch": 0.49,
|
13953 |
+
"grad_norm": 7.519322395324707,
|
13954 |
+
"learning_rate": 7.12542372881356e-06,
|
13955 |
+
"loss": 1.261,
|
13956 |
+
"step": 19490
|
13957 |
+
},
|
13958 |
+
{
|
13959 |
+
"epoch": 0.49,
|
13960 |
+
"grad_norm": 1.5350415706634521,
|
13961 |
+
"learning_rate": 7.1186440677966106e-06,
|
13962 |
+
"loss": 1.2517,
|
13963 |
+
"step": 19500
|
13964 |
+
},
|
13965 |
+
{
|
13966 |
+
"epoch": 0.49,
|
13967 |
+
"eval_loss": 1.3194345235824585,
|
13968 |
+
"eval_runtime": 66.1305,
|
13969 |
+
"eval_samples_per_second": 15.122,
|
13970 |
+
"eval_steps_per_second": 15.122,
|
13971 |
+
"step": 19500
|
13972 |
+
},
|
13973 |
+
{
|
13974 |
+
"epoch": 0.49,
|
13975 |
+
"grad_norm": 3.0648765563964844,
|
13976 |
+
"learning_rate": 7.111864406779662e-06,
|
13977 |
+
"loss": 1.398,
|
13978 |
+
"step": 19510
|
13979 |
+
},
|
13980 |
+
{
|
13981 |
+
"epoch": 0.49,
|
13982 |
+
"grad_norm": 5.124095439910889,
|
13983 |
+
"learning_rate": 7.1050847457627125e-06,
|
13984 |
+
"loss": 1.404,
|
13985 |
+
"step": 19520
|
13986 |
+
},
|
13987 |
+
{
|
13988 |
+
"epoch": 0.49,
|
13989 |
+
"grad_norm": 2.8961544036865234,
|
13990 |
+
"learning_rate": 7.098305084745764e-06,
|
13991 |
+
"loss": 1.2651,
|
13992 |
+
"step": 19530
|
13993 |
+
},
|
13994 |
+
{
|
13995 |
+
"epoch": 0.49,
|
13996 |
+
"grad_norm": 3.0641605854034424,
|
13997 |
+
"learning_rate": 7.0915254237288145e-06,
|
13998 |
+
"loss": 1.3587,
|
13999 |
+
"step": 19540
|
14000 |
+
},
|
14001 |
+
{
|
14002 |
+
"epoch": 0.49,
|
14003 |
+
"grad_norm": 3.618454933166504,
|
14004 |
+
"learning_rate": 7.084745762711865e-06,
|
14005 |
+
"loss": 1.3533,
|
14006 |
+
"step": 19550
|
14007 |
+
},
|
14008 |
+
{
|
14009 |
+
"epoch": 0.49,
|
14010 |
+
"grad_norm": 3.3083536624908447,
|
14011 |
+
"learning_rate": 7.077966101694916e-06,
|
14012 |
+
"loss": 1.2859,
|
14013 |
+
"step": 19560
|
14014 |
+
},
|
14015 |
+
{
|
14016 |
+
"epoch": 0.49,
|
14017 |
+
"grad_norm": 1.260048508644104,
|
14018 |
+
"learning_rate": 7.071186440677966e-06,
|
14019 |
+
"loss": 1.2224,
|
14020 |
+
"step": 19570
|
14021 |
+
},
|
14022 |
+
{
|
14023 |
+
"epoch": 0.49,
|
14024 |
+
"grad_norm": 5.410586833953857,
|
14025 |
+
"learning_rate": 7.0644067796610175e-06,
|
14026 |
+
"loss": 1.2916,
|
14027 |
+
"step": 19580
|
14028 |
+
},
|
14029 |
+
{
|
14030 |
+
"epoch": 0.49,
|
14031 |
+
"grad_norm": 1.9295905828475952,
|
14032 |
+
"learning_rate": 7.057627118644068e-06,
|
14033 |
+
"loss": 1.3004,
|
14034 |
+
"step": 19590
|
14035 |
+
},
|
14036 |
+
{
|
14037 |
+
"epoch": 0.49,
|
14038 |
+
"grad_norm": 10.48406982421875,
|
14039 |
+
"learning_rate": 7.0508474576271195e-06,
|
14040 |
+
"loss": 1.3105,
|
14041 |
+
"step": 19600
|
14042 |
+
},
|
14043 |
+
{
|
14044 |
+
"epoch": 0.49,
|
14045 |
+
"grad_norm": 4.99778413772583,
|
14046 |
+
"learning_rate": 7.04406779661017e-06,
|
14047 |
+
"loss": 1.2481,
|
14048 |
+
"step": 19610
|
14049 |
+
},
|
14050 |
+
{
|
14051 |
+
"epoch": 0.49,
|
14052 |
+
"grad_norm": 5.476469993591309,
|
14053 |
+
"learning_rate": 7.037288135593221e-06,
|
14054 |
+
"loss": 1.345,
|
14055 |
+
"step": 19620
|
14056 |
+
},
|
14057 |
+
{
|
14058 |
+
"epoch": 0.49,
|
14059 |
+
"grad_norm": 7.250665664672852,
|
14060 |
+
"learning_rate": 7.030508474576272e-06,
|
14061 |
+
"loss": 1.2993,
|
14062 |
+
"step": 19630
|
14063 |
+
},
|
14064 |
+
{
|
14065 |
+
"epoch": 0.49,
|
14066 |
+
"grad_norm": 7.802820682525635,
|
14067 |
+
"learning_rate": 7.0237288135593225e-06,
|
14068 |
+
"loss": 1.3106,
|
14069 |
+
"step": 19640
|
14070 |
+
},
|
14071 |
+
{
|
14072 |
+
"epoch": 0.49,
|
14073 |
+
"grad_norm": 2.5077409744262695,
|
14074 |
+
"learning_rate": 7.016949152542374e-06,
|
14075 |
+
"loss": 1.1298,
|
14076 |
+
"step": 19650
|
14077 |
+
},
|
14078 |
+
{
|
14079 |
+
"epoch": 0.49,
|
14080 |
+
"grad_norm": 2.3012547492980957,
|
14081 |
+
"learning_rate": 7.0101694915254245e-06,
|
14082 |
+
"loss": 1.2944,
|
14083 |
+
"step": 19660
|
14084 |
+
},
|
14085 |
+
{
|
14086 |
+
"epoch": 0.49,
|
14087 |
+
"grad_norm": 10.731738090515137,
|
14088 |
+
"learning_rate": 7.003389830508475e-06,
|
14089 |
+
"loss": 1.2545,
|
14090 |
+
"step": 19670
|
14091 |
+
},
|
14092 |
+
{
|
14093 |
+
"epoch": 0.49,
|
14094 |
+
"grad_norm": 11.851224899291992,
|
14095 |
+
"learning_rate": 6.996610169491526e-06,
|
14096 |
+
"loss": 1.3364,
|
14097 |
+
"step": 19680
|
14098 |
+
},
|
14099 |
+
{
|
14100 |
+
"epoch": 0.49,
|
14101 |
+
"grad_norm": 2.060750722885132,
|
14102 |
+
"learning_rate": 6.989830508474576e-06,
|
14103 |
+
"loss": 1.2909,
|
14104 |
+
"step": 19690
|
14105 |
+
},
|
14106 |
+
{
|
14107 |
+
"epoch": 0.49,
|
14108 |
+
"grad_norm": 8.549054145812988,
|
14109 |
+
"learning_rate": 6.9830508474576275e-06,
|
14110 |
+
"loss": 1.405,
|
14111 |
+
"step": 19700
|
14112 |
+
},
|
14113 |
+
{
|
14114 |
+
"epoch": 0.49,
|
14115 |
+
"grad_norm": 5.37472677230835,
|
14116 |
+
"learning_rate": 6.976271186440678e-06,
|
14117 |
+
"loss": 1.5615,
|
14118 |
+
"step": 19710
|
14119 |
+
},
|
14120 |
+
{
|
14121 |
+
"epoch": 0.49,
|
14122 |
+
"grad_norm": 4.753462314605713,
|
14123 |
+
"learning_rate": 6.9694915254237295e-06,
|
14124 |
+
"loss": 1.362,
|
14125 |
+
"step": 19720
|
14126 |
+
},
|
14127 |
+
{
|
14128 |
+
"epoch": 0.49,
|
14129 |
+
"grad_norm": 6.330374717712402,
|
14130 |
+
"learning_rate": 6.96271186440678e-06,
|
14131 |
+
"loss": 1.3563,
|
14132 |
+
"step": 19730
|
14133 |
+
},
|
14134 |
+
{
|
14135 |
+
"epoch": 0.49,
|
14136 |
+
"grad_norm": 6.962733268737793,
|
14137 |
+
"learning_rate": 6.9559322033898315e-06,
|
14138 |
+
"loss": 1.3272,
|
14139 |
+
"step": 19740
|
14140 |
+
},
|
14141 |
+
{
|
14142 |
+
"epoch": 0.49,
|
14143 |
+
"grad_norm": 13.372142791748047,
|
14144 |
+
"learning_rate": 6.949152542372882e-06,
|
14145 |
+
"loss": 1.4422,
|
14146 |
+
"step": 19750
|
14147 |
+
},
|
14148 |
+
{
|
14149 |
+
"epoch": 0.49,
|
14150 |
+
"grad_norm": 4.561896800994873,
|
14151 |
+
"learning_rate": 6.942372881355933e-06,
|
14152 |
+
"loss": 1.3691,
|
14153 |
+
"step": 19760
|
14154 |
+
},
|
14155 |
+
{
|
14156 |
+
"epoch": 0.49,
|
14157 |
+
"grad_norm": 2.2547922134399414,
|
14158 |
+
"learning_rate": 6.935593220338983e-06,
|
14159 |
+
"loss": 1.4905,
|
14160 |
+
"step": 19770
|
14161 |
+
},
|
14162 |
+
{
|
14163 |
+
"epoch": 0.49,
|
14164 |
+
"grad_norm": 4.15011739730835,
|
14165 |
+
"learning_rate": 6.928813559322034e-06,
|
14166 |
+
"loss": 1.3899,
|
14167 |
+
"step": 19780
|
14168 |
+
},
|
14169 |
+
{
|
14170 |
+
"epoch": 0.49,
|
14171 |
+
"grad_norm": 11.278037071228027,
|
14172 |
+
"learning_rate": 6.922033898305085e-06,
|
14173 |
+
"loss": 1.0803,
|
14174 |
+
"step": 19790
|
14175 |
+
},
|
14176 |
+
{
|
14177 |
+
"epoch": 0.49,
|
14178 |
+
"grad_norm": 9.100043296813965,
|
14179 |
+
"learning_rate": 6.915254237288136e-06,
|
14180 |
+
"loss": 1.4593,
|
14181 |
+
"step": 19800
|
14182 |
+
},
|
14183 |
+
{
|
14184 |
+
"epoch": 0.5,
|
14185 |
+
"grad_norm": 10.643383026123047,
|
14186 |
+
"learning_rate": 6.908474576271187e-06,
|
14187 |
+
"loss": 1.3571,
|
14188 |
+
"step": 19810
|
14189 |
+
},
|
14190 |
+
{
|
14191 |
+
"epoch": 0.5,
|
14192 |
+
"grad_norm": 11.314017295837402,
|
14193 |
+
"learning_rate": 6.901694915254238e-06,
|
14194 |
+
"loss": 1.2376,
|
14195 |
+
"step": 19820
|
14196 |
+
},
|
14197 |
+
{
|
14198 |
+
"epoch": 0.5,
|
14199 |
+
"grad_norm": 3.855220317840576,
|
14200 |
+
"learning_rate": 6.894915254237289e-06,
|
14201 |
+
"loss": 1.1095,
|
14202 |
+
"step": 19830
|
14203 |
+
},
|
14204 |
+
{
|
14205 |
+
"epoch": 0.5,
|
14206 |
+
"grad_norm": 5.896849155426025,
|
14207 |
+
"learning_rate": 6.8881355932203395e-06,
|
14208 |
+
"loss": 1.2078,
|
14209 |
+
"step": 19840
|
14210 |
+
},
|
14211 |
+
{
|
14212 |
+
"epoch": 0.5,
|
14213 |
+
"grad_norm": 2.1142632961273193,
|
14214 |
+
"learning_rate": 6.881355932203391e-06,
|
14215 |
+
"loss": 1.3017,
|
14216 |
+
"step": 19850
|
14217 |
+
},
|
14218 |
+
{
|
14219 |
+
"epoch": 0.5,
|
14220 |
+
"grad_norm": 7.116094589233398,
|
14221 |
+
"learning_rate": 6.8745762711864415e-06,
|
14222 |
+
"loss": 1.217,
|
14223 |
+
"step": 19860
|
14224 |
+
},
|
14225 |
+
{
|
14226 |
+
"epoch": 0.5,
|
14227 |
+
"grad_norm": 3.392282724380493,
|
14228 |
+
"learning_rate": 6.867796610169493e-06,
|
14229 |
+
"loss": 1.4027,
|
14230 |
+
"step": 19870
|
14231 |
+
},
|
14232 |
+
{
|
14233 |
+
"epoch": 0.5,
|
14234 |
+
"grad_norm": 5.760110855102539,
|
14235 |
+
"learning_rate": 6.861016949152543e-06,
|
14236 |
+
"loss": 1.2888,
|
14237 |
+
"step": 19880
|
14238 |
+
},
|
14239 |
+
{
|
14240 |
+
"epoch": 0.5,
|
14241 |
+
"grad_norm": 2.312904119491577,
|
14242 |
+
"learning_rate": 6.854237288135593e-06,
|
14243 |
+
"loss": 1.39,
|
14244 |
+
"step": 19890
|
14245 |
+
},
|
14246 |
+
{
|
14247 |
+
"epoch": 0.5,
|
14248 |
+
"grad_norm": 9.748230934143066,
|
14249 |
+
"learning_rate": 6.8474576271186445e-06,
|
14250 |
+
"loss": 1.408,
|
14251 |
+
"step": 19900
|
14252 |
+
},
|
14253 |
+
{
|
14254 |
+
"epoch": 0.5,
|
14255 |
+
"grad_norm": 8.029982566833496,
|
14256 |
+
"learning_rate": 6.840677966101695e-06,
|
14257 |
+
"loss": 1.2631,
|
14258 |
+
"step": 19910
|
14259 |
+
},
|
14260 |
+
{
|
14261 |
+
"epoch": 0.5,
|
14262 |
+
"grad_norm": 5.408463954925537,
|
14263 |
+
"learning_rate": 6.8338983050847465e-06,
|
14264 |
+
"loss": 1.3545,
|
14265 |
+
"step": 19920
|
14266 |
+
},
|
14267 |
+
{
|
14268 |
+
"epoch": 0.5,
|
14269 |
+
"grad_norm": 4.480403900146484,
|
14270 |
+
"learning_rate": 6.827118644067797e-06,
|
14271 |
+
"loss": 1.4549,
|
14272 |
+
"step": 19930
|
14273 |
+
},
|
14274 |
+
{
|
14275 |
+
"epoch": 0.5,
|
14276 |
+
"grad_norm": 8.981225967407227,
|
14277 |
+
"learning_rate": 6.8203389830508485e-06,
|
14278 |
+
"loss": 1.3352,
|
14279 |
+
"step": 19940
|
14280 |
+
},
|
14281 |
+
{
|
14282 |
+
"epoch": 0.5,
|
14283 |
+
"grad_norm": 1.9335066080093384,
|
14284 |
+
"learning_rate": 6.813559322033899e-06,
|
14285 |
+
"loss": 1.2825,
|
14286 |
+
"step": 19950
|
14287 |
+
},
|
14288 |
+
{
|
14289 |
+
"epoch": 0.5,
|
14290 |
+
"grad_norm": 7.6206464767456055,
|
14291 |
+
"learning_rate": 6.80677966101695e-06,
|
14292 |
+
"loss": 1.4976,
|
14293 |
+
"step": 19960
|
14294 |
+
},
|
14295 |
+
{
|
14296 |
+
"epoch": 0.5,
|
14297 |
+
"grad_norm": 14.687816619873047,
|
14298 |
+
"learning_rate": 6.800000000000001e-06,
|
14299 |
+
"loss": 1.4687,
|
14300 |
+
"step": 19970
|
14301 |
+
},
|
14302 |
+
{
|
14303 |
+
"epoch": 0.5,
|
14304 |
+
"grad_norm": 9.034219741821289,
|
14305 |
+
"learning_rate": 6.793220338983051e-06,
|
14306 |
+
"loss": 1.4715,
|
14307 |
+
"step": 19980
|
14308 |
+
},
|
14309 |
+
{
|
14310 |
+
"epoch": 0.5,
|
14311 |
+
"grad_norm": 8.120539665222168,
|
14312 |
+
"learning_rate": 6.786440677966102e-06,
|
14313 |
+
"loss": 1.4331,
|
14314 |
+
"step": 19990
|
14315 |
+
},
|
14316 |
+
{
|
14317 |
+
"epoch": 0.5,
|
14318 |
+
"grad_norm": 6.594362258911133,
|
14319 |
+
"learning_rate": 6.779661016949153e-06,
|
14320 |
+
"loss": 1.4516,
|
14321 |
+
"step": 20000
|
14322 |
+
},
|
14323 |
+
{
|
14324 |
+
"epoch": 0.5,
|
14325 |
+
"eval_loss": 1.348677158355713,
|
14326 |
+
"eval_runtime": 66.1188,
|
14327 |
+
"eval_samples_per_second": 15.124,
|
14328 |
+
"eval_steps_per_second": 15.124,
|
14329 |
+
"step": 20000
|
14330 |
}
|
14331 |
],
|
14332 |
"logging_steps": 10,
|
|
|
14334 |
"num_input_tokens_seen": 0,
|
14335 |
"num_train_epochs": 1,
|
14336 |
"save_steps": 2500,
|
14337 |
+
"total_flos": 3.2204251987968e+17,
|
14338 |
"train_batch_size": 1,
|
14339 |
"trial_name": null,
|
14340 |
"trial_params": null
|