Training in progress, step 96000, checkpoint
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +353 -3
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 487156538
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f6d0fcc97e8f38ef4f177931a0b80e81f26fc2c3df44b1a834117199e9947a28
|
| 3 |
size 487156538
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1059459406
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:22ff6e0e3931ffc4543840546a265cb104c9f8d044f751f1d1689af1785ac414
|
| 3 |
size 1059459406
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f16967f2c5425e2d3062a5c645e3099ddca9226921c4faea80b6f226c345b14e
|
| 3 |
size 14960
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a57ad377b3c1d54cf00ec0f4a45ee8dcdfa0afda14d68b22cde2fefed3a794b
|
| 3 |
size 14960
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db33e02267588a29827a8a403fef0f365d2173a37eee0f81da95484780955b80
|
| 3 |
size 14960
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:34b2c97eeef7042413901b15e9528f4663411f959a4a8a8086a8807b96ca132e
|
| 3 |
size 14960
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9bee3f55096050a1b77c497eef45ddf3e44fe16bc128ff4c2f549ae26e06537c
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -33258,6 +33258,356 @@
|
|
| 33258 |
"learning_rate": 0.00047666377903599896,
|
| 33259 |
"loss": 16.2668,
|
| 33260 |
"step": 95000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33261 |
}
|
| 33262 |
],
|
| 33263 |
"logging_steps": 20,
|
|
@@ -33277,7 +33627,7 @@
|
|
| 33277 |
"attributes": {}
|
| 33278 |
}
|
| 33279 |
},
|
| 33280 |
-
"total_flos": 2.
|
| 33281 |
"train_batch_size": 48,
|
| 33282 |
"trial_name": null,
|
| 33283 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.14220621085625915,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 96000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 33258 |
"learning_rate": 0.00047666377903599896,
|
| 33259 |
"loss": 16.2668,
|
| 33260 |
"step": 95000
|
| 33261 |
+
},
|
| 33262 |
+
{
|
| 33263 |
+
"epoch": 0.14075452245376815,
|
| 33264 |
+
"grad_norm": 8.0,
|
| 33265 |
+
"learning_rate": 0.00047665884010099135,
|
| 33266 |
+
"loss": 16.2326,
|
| 33267 |
+
"step": 95020
|
| 33268 |
+
},
|
| 33269 |
+
{
|
| 33270 |
+
"epoch": 0.14078414874769657,
|
| 33271 |
+
"grad_norm": 8.0625,
|
| 33272 |
+
"learning_rate": 0.0004766539011659838,
|
| 33273 |
+
"loss": 16.2558,
|
| 33274 |
+
"step": 95040
|
| 33275 |
+
},
|
| 33276 |
+
{
|
| 33277 |
+
"epoch": 0.14081377504162496,
|
| 33278 |
+
"grad_norm": 9.9375,
|
| 33279 |
+
"learning_rate": 0.0004766489622309762,
|
| 33280 |
+
"loss": 16.2363,
|
| 33281 |
+
"step": 95060
|
| 33282 |
+
},
|
| 33283 |
+
{
|
| 33284 |
+
"epoch": 0.14084340133555334,
|
| 33285 |
+
"grad_norm": 7.65625,
|
| 33286 |
+
"learning_rate": 0.0004766440232959687,
|
| 33287 |
+
"loss": 16.2672,
|
| 33288 |
+
"step": 95080
|
| 33289 |
+
},
|
| 33290 |
+
{
|
| 33291 |
+
"epoch": 0.14087302762948173,
|
| 33292 |
+
"grad_norm": 6.84375,
|
| 33293 |
+
"learning_rate": 0.0004766390843609611,
|
| 33294 |
+
"loss": 16.3032,
|
| 33295 |
+
"step": 95100
|
| 33296 |
+
},
|
| 33297 |
+
{
|
| 33298 |
+
"epoch": 0.14090265392341011,
|
| 33299 |
+
"grad_norm": 7.5625,
|
| 33300 |
+
"learning_rate": 0.00047663414542595354,
|
| 33301 |
+
"loss": 16.1972,
|
| 33302 |
+
"step": 95120
|
| 33303 |
+
},
|
| 33304 |
+
{
|
| 33305 |
+
"epoch": 0.1409322802173385,
|
| 33306 |
+
"grad_norm": 7.96875,
|
| 33307 |
+
"learning_rate": 0.00047662920649094593,
|
| 33308 |
+
"loss": 16.2712,
|
| 33309 |
+
"step": 95140
|
| 33310 |
+
},
|
| 33311 |
+
{
|
| 33312 |
+
"epoch": 0.1409619065112669,
|
| 33313 |
+
"grad_norm": 9.8125,
|
| 33314 |
+
"learning_rate": 0.00047662426755593843,
|
| 33315 |
+
"loss": 16.2999,
|
| 33316 |
+
"step": 95160
|
| 33317 |
+
},
|
| 33318 |
+
{
|
| 33319 |
+
"epoch": 0.14099153280519527,
|
| 33320 |
+
"grad_norm": 7.71875,
|
| 33321 |
+
"learning_rate": 0.0004766193286209308,
|
| 33322 |
+
"loss": 16.2646,
|
| 33323 |
+
"step": 95180
|
| 33324 |
+
},
|
| 33325 |
+
{
|
| 33326 |
+
"epoch": 0.14102115909912366,
|
| 33327 |
+
"grad_norm": 7.5,
|
| 33328 |
+
"learning_rate": 0.0004766143896859232,
|
| 33329 |
+
"loss": 16.2533,
|
| 33330 |
+
"step": 95200
|
| 33331 |
+
},
|
| 33332 |
+
{
|
| 33333 |
+
"epoch": 0.14105078539305205,
|
| 33334 |
+
"grad_norm": 8.4375,
|
| 33335 |
+
"learning_rate": 0.00047660945075091567,
|
| 33336 |
+
"loss": 16.2281,
|
| 33337 |
+
"step": 95220
|
| 33338 |
+
},
|
| 33339 |
+
{
|
| 33340 |
+
"epoch": 0.14108041168698043,
|
| 33341 |
+
"grad_norm": 6.65625,
|
| 33342 |
+
"learning_rate": 0.0004766045118159081,
|
| 33343 |
+
"loss": 16.2489,
|
| 33344 |
+
"step": 95240
|
| 33345 |
+
},
|
| 33346 |
+
{
|
| 33347 |
+
"epoch": 0.14111003798090882,
|
| 33348 |
+
"grad_norm": 8.3125,
|
| 33349 |
+
"learning_rate": 0.00047659957288090056,
|
| 33350 |
+
"loss": 16.2813,
|
| 33351 |
+
"step": 95260
|
| 33352 |
+
},
|
| 33353 |
+
{
|
| 33354 |
+
"epoch": 0.1411396642748372,
|
| 33355 |
+
"grad_norm": 9.0,
|
| 33356 |
+
"learning_rate": 0.00047659463394589296,
|
| 33357 |
+
"loss": 16.2566,
|
| 33358 |
+
"step": 95280
|
| 33359 |
+
},
|
| 33360 |
+
{
|
| 33361 |
+
"epoch": 0.1411692905687656,
|
| 33362 |
+
"grad_norm": 8.75,
|
| 33363 |
+
"learning_rate": 0.00047658969501088546,
|
| 33364 |
+
"loss": 16.2978,
|
| 33365 |
+
"step": 95300
|
| 33366 |
+
},
|
| 33367 |
+
{
|
| 33368 |
+
"epoch": 0.14119891686269398,
|
| 33369 |
+
"grad_norm": 7.34375,
|
| 33370 |
+
"learning_rate": 0.00047658475607587785,
|
| 33371 |
+
"loss": 16.2168,
|
| 33372 |
+
"step": 95320
|
| 33373 |
+
},
|
| 33374 |
+
{
|
| 33375 |
+
"epoch": 0.14122854315662237,
|
| 33376 |
+
"grad_norm": 7.8125,
|
| 33377 |
+
"learning_rate": 0.0004765798171408703,
|
| 33378 |
+
"loss": 16.2683,
|
| 33379 |
+
"step": 95340
|
| 33380 |
+
},
|
| 33381 |
+
{
|
| 33382 |
+
"epoch": 0.14125816945055075,
|
| 33383 |
+
"grad_norm": 11.0,
|
| 33384 |
+
"learning_rate": 0.0004765748782058627,
|
| 33385 |
+
"loss": 16.3143,
|
| 33386 |
+
"step": 95360
|
| 33387 |
+
},
|
| 33388 |
+
{
|
| 33389 |
+
"epoch": 0.14128779574447914,
|
| 33390 |
+
"grad_norm": 8.0,
|
| 33391 |
+
"learning_rate": 0.0004765699392708552,
|
| 33392 |
+
"loss": 16.2963,
|
| 33393 |
+
"step": 95380
|
| 33394 |
+
},
|
| 33395 |
+
{
|
| 33396 |
+
"epoch": 0.14131742203840753,
|
| 33397 |
+
"grad_norm": 7.96875,
|
| 33398 |
+
"learning_rate": 0.0004765650003358476,
|
| 33399 |
+
"loss": 16.2477,
|
| 33400 |
+
"step": 95400
|
| 33401 |
+
},
|
| 33402 |
+
{
|
| 33403 |
+
"epoch": 0.1413470483323359,
|
| 33404 |
+
"grad_norm": 8.8125,
|
| 33405 |
+
"learning_rate": 0.00047656006140084004,
|
| 33406 |
+
"loss": 16.2887,
|
| 33407 |
+
"step": 95420
|
| 33408 |
+
},
|
| 33409 |
+
{
|
| 33410 |
+
"epoch": 0.1413766746262643,
|
| 33411 |
+
"grad_norm": 6.875,
|
| 33412 |
+
"learning_rate": 0.00047655512246583243,
|
| 33413 |
+
"loss": 16.3026,
|
| 33414 |
+
"step": 95440
|
| 33415 |
+
},
|
| 33416 |
+
{
|
| 33417 |
+
"epoch": 0.14140630092019268,
|
| 33418 |
+
"grad_norm": 7.28125,
|
| 33419 |
+
"learning_rate": 0.00047655018353082493,
|
| 33420 |
+
"loss": 16.2857,
|
| 33421 |
+
"step": 95460
|
| 33422 |
+
},
|
| 33423 |
+
{
|
| 33424 |
+
"epoch": 0.14143592721412107,
|
| 33425 |
+
"grad_norm": 8.75,
|
| 33426 |
+
"learning_rate": 0.0004765452445958173,
|
| 33427 |
+
"loss": 16.2834,
|
| 33428 |
+
"step": 95480
|
| 33429 |
+
},
|
| 33430 |
+
{
|
| 33431 |
+
"epoch": 0.14146555350804946,
|
| 33432 |
+
"grad_norm": 9.3125,
|
| 33433 |
+
"learning_rate": 0.0004765403056608098,
|
| 33434 |
+
"loss": 16.1975,
|
| 33435 |
+
"step": 95500
|
| 33436 |
+
},
|
| 33437 |
+
{
|
| 33438 |
+
"epoch": 0.14149517980197784,
|
| 33439 |
+
"grad_norm": 8.5625,
|
| 33440 |
+
"learning_rate": 0.00047653536672580217,
|
| 33441 |
+
"loss": 16.2638,
|
| 33442 |
+
"step": 95520
|
| 33443 |
+
},
|
| 33444 |
+
{
|
| 33445 |
+
"epoch": 0.14152480609590623,
|
| 33446 |
+
"grad_norm": 7.53125,
|
| 33447 |
+
"learning_rate": 0.0004765304277907946,
|
| 33448 |
+
"loss": 16.209,
|
| 33449 |
+
"step": 95540
|
| 33450 |
+
},
|
| 33451 |
+
{
|
| 33452 |
+
"epoch": 0.14155443238983462,
|
| 33453 |
+
"grad_norm": 7.09375,
|
| 33454 |
+
"learning_rate": 0.00047652548885578706,
|
| 33455 |
+
"loss": 16.2836,
|
| 33456 |
+
"step": 95560
|
| 33457 |
+
},
|
| 33458 |
+
{
|
| 33459 |
+
"epoch": 0.141584058683763,
|
| 33460 |
+
"grad_norm": 9.375,
|
| 33461 |
+
"learning_rate": 0.00047652054992077946,
|
| 33462 |
+
"loss": 16.2385,
|
| 33463 |
+
"step": 95580
|
| 33464 |
+
},
|
| 33465 |
+
{
|
| 33466 |
+
"epoch": 0.1416136849776914,
|
| 33467 |
+
"grad_norm": 6.15625,
|
| 33468 |
+
"learning_rate": 0.00047651561098577196,
|
| 33469 |
+
"loss": 16.3201,
|
| 33470 |
+
"step": 95600
|
| 33471 |
+
},
|
| 33472 |
+
{
|
| 33473 |
+
"epoch": 0.14164331127161978,
|
| 33474 |
+
"grad_norm": 7.09375,
|
| 33475 |
+
"learning_rate": 0.00047651067205076435,
|
| 33476 |
+
"loss": 16.2798,
|
| 33477 |
+
"step": 95620
|
| 33478 |
+
},
|
| 33479 |
+
{
|
| 33480 |
+
"epoch": 0.14167293756554816,
|
| 33481 |
+
"grad_norm": 8.3125,
|
| 33482 |
+
"learning_rate": 0.0004765057331157568,
|
| 33483 |
+
"loss": 16.3551,
|
| 33484 |
+
"step": 95640
|
| 33485 |
+
},
|
| 33486 |
+
{
|
| 33487 |
+
"epoch": 0.14170256385947655,
|
| 33488 |
+
"grad_norm": 6.25,
|
| 33489 |
+
"learning_rate": 0.0004765007941807492,
|
| 33490 |
+
"loss": 16.244,
|
| 33491 |
+
"step": 95660
|
| 33492 |
+
},
|
| 33493 |
+
{
|
| 33494 |
+
"epoch": 0.14173219015340496,
|
| 33495 |
+
"grad_norm": 9.375,
|
| 33496 |
+
"learning_rate": 0.0004764958552457417,
|
| 33497 |
+
"loss": 16.2651,
|
| 33498 |
+
"step": 95680
|
| 33499 |
+
},
|
| 33500 |
+
{
|
| 33501 |
+
"epoch": 0.14176181644733335,
|
| 33502 |
+
"grad_norm": 14.3125,
|
| 33503 |
+
"learning_rate": 0.0004764909163107341,
|
| 33504 |
+
"loss": 16.2386,
|
| 33505 |
+
"step": 95700
|
| 33506 |
+
},
|
| 33507 |
+
{
|
| 33508 |
+
"epoch": 0.14179144274126174,
|
| 33509 |
+
"grad_norm": 7.15625,
|
| 33510 |
+
"learning_rate": 0.00047648597737572654,
|
| 33511 |
+
"loss": 16.2521,
|
| 33512 |
+
"step": 95720
|
| 33513 |
+
},
|
| 33514 |
+
{
|
| 33515 |
+
"epoch": 0.14182106903519012,
|
| 33516 |
+
"grad_norm": 7.5,
|
| 33517 |
+
"learning_rate": 0.00047648103844071893,
|
| 33518 |
+
"loss": 16.2774,
|
| 33519 |
+
"step": 95740
|
| 33520 |
+
},
|
| 33521 |
+
{
|
| 33522 |
+
"epoch": 0.1418506953291185,
|
| 33523 |
+
"grad_norm": 6.9375,
|
| 33524 |
+
"learning_rate": 0.00047647609950571143,
|
| 33525 |
+
"loss": 16.2116,
|
| 33526 |
+
"step": 95760
|
| 33527 |
+
},
|
| 33528 |
+
{
|
| 33529 |
+
"epoch": 0.1418803216230469,
|
| 33530 |
+
"grad_norm": 6.125,
|
| 33531 |
+
"learning_rate": 0.0004764711605707038,
|
| 33532 |
+
"loss": 16.2365,
|
| 33533 |
+
"step": 95780
|
| 33534 |
+
},
|
| 33535 |
+
{
|
| 33536 |
+
"epoch": 0.14190994791697528,
|
| 33537 |
+
"grad_norm": 8.8125,
|
| 33538 |
+
"learning_rate": 0.0004764662216356963,
|
| 33539 |
+
"loss": 16.267,
|
| 33540 |
+
"step": 95800
|
| 33541 |
+
},
|
| 33542 |
+
{
|
| 33543 |
+
"epoch": 0.14193957421090367,
|
| 33544 |
+
"grad_norm": 7.8125,
|
| 33545 |
+
"learning_rate": 0.00047646128270068867,
|
| 33546 |
+
"loss": 16.382,
|
| 33547 |
+
"step": 95820
|
| 33548 |
+
},
|
| 33549 |
+
{
|
| 33550 |
+
"epoch": 0.14196920050483205,
|
| 33551 |
+
"grad_norm": 10.5,
|
| 33552 |
+
"learning_rate": 0.00047645634376568117,
|
| 33553 |
+
"loss": 16.2225,
|
| 33554 |
+
"step": 95840
|
| 33555 |
+
},
|
| 33556 |
+
{
|
| 33557 |
+
"epoch": 0.14199882679876044,
|
| 33558 |
+
"grad_norm": 6.9375,
|
| 33559 |
+
"learning_rate": 0.00047645140483067356,
|
| 33560 |
+
"loss": 16.2772,
|
| 33561 |
+
"step": 95860
|
| 33562 |
+
},
|
| 33563 |
+
{
|
| 33564 |
+
"epoch": 0.14202845309268883,
|
| 33565 |
+
"grad_norm": 6.59375,
|
| 33566 |
+
"learning_rate": 0.00047644646589566596,
|
| 33567 |
+
"loss": 16.2962,
|
| 33568 |
+
"step": 95880
|
| 33569 |
+
},
|
| 33570 |
+
{
|
| 33571 |
+
"epoch": 0.14205807938661721,
|
| 33572 |
+
"grad_norm": 11.8125,
|
| 33573 |
+
"learning_rate": 0.00047644152696065846,
|
| 33574 |
+
"loss": 16.3097,
|
| 33575 |
+
"step": 95900
|
| 33576 |
+
},
|
| 33577 |
+
{
|
| 33578 |
+
"epoch": 0.1420877056805456,
|
| 33579 |
+
"grad_norm": 7.0625,
|
| 33580 |
+
"learning_rate": 0.00047643658802565085,
|
| 33581 |
+
"loss": 16.2627,
|
| 33582 |
+
"step": 95920
|
| 33583 |
+
},
|
| 33584 |
+
{
|
| 33585 |
+
"epoch": 0.142117331974474,
|
| 33586 |
+
"grad_norm": 7.75,
|
| 33587 |
+
"learning_rate": 0.0004764316490906433,
|
| 33588 |
+
"loss": 16.2114,
|
| 33589 |
+
"step": 95940
|
| 33590 |
+
},
|
| 33591 |
+
{
|
| 33592 |
+
"epoch": 0.14214695826840237,
|
| 33593 |
+
"grad_norm": 20.25,
|
| 33594 |
+
"learning_rate": 0.0004764267101556357,
|
| 33595 |
+
"loss": 16.1394,
|
| 33596 |
+
"step": 95960
|
| 33597 |
+
},
|
| 33598 |
+
{
|
| 33599 |
+
"epoch": 0.14217658456233076,
|
| 33600 |
+
"grad_norm": 6.84375,
|
| 33601 |
+
"learning_rate": 0.0004764217712206282,
|
| 33602 |
+
"loss": 16.261,
|
| 33603 |
+
"step": 95980
|
| 33604 |
+
},
|
| 33605 |
+
{
|
| 33606 |
+
"epoch": 0.14220621085625915,
|
| 33607 |
+
"grad_norm": 7.34375,
|
| 33608 |
+
"learning_rate": 0.0004764168322856206,
|
| 33609 |
+
"loss": 16.226,
|
| 33610 |
+
"step": 96000
|
| 33611 |
}
|
| 33612 |
],
|
| 33613 |
"logging_steps": 20,
|
|
|
|
| 33627 |
"attributes": {}
|
| 33628 |
}
|
| 33629 |
},
|
| 33630 |
+
"total_flos": 2.1365408984116442e+20,
|
| 33631 |
"train_batch_size": 48,
|
| 33632 |
"trial_name": null,
|
| 33633 |
"trial_params": null
|