diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24440 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999713113578335, + "eval_steps": 500, + "global_step": 17428, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.38671875, + "learning_rate": 1.1474469305794606e-07, + "loss": 1.3368, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.376953125, + "learning_rate": 5.737234652897304e-07, + "loss": 1.0791, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.31640625, + "learning_rate": 1.1474469305794607e-06, + "loss": 1.1069, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.3671875, + "learning_rate": 1.721170395869191e-06, + "loss": 1.1566, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 0.384765625, + "learning_rate": 2.2948938611589215e-06, + "loss": 1.1799, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 0.33203125, + "learning_rate": 2.868617326448652e-06, + "loss": 1.111, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 3.442340791738382e-06, + "loss": 1.0485, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 0.33984375, + "learning_rate": 4.016064257028113e-06, + "loss": 1.1027, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 0.3125, + "learning_rate": 4.589787722317843e-06, + "loss": 1.1293, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 0.34375, + "learning_rate": 5.163511187607573e-06, + "loss": 1.0885, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 0.3203125, + "learning_rate": 5.737234652897304e-06, + "loss": 1.1255, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 0.33203125, + "learning_rate": 6.310958118187034e-06, + "loss": 1.1155, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 0.298828125, + "learning_rate": 6.884681583476764e-06, + "loss": 1.0925, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 0.314453125, + "learning_rate": 7.4584050487664955e-06, + "loss": 1.1294, + "step": 65 + }, + { + "epoch": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 8.032128514056226e-06, + "loss": 1.1594, + "step": 70 + }, + { + "epoch": 0.0, + "grad_norm": 0.2734375, + "learning_rate": 8.605851979345956e-06, + "loss": 1.0828, + "step": 75 + }, + { + "epoch": 0.0, + "grad_norm": 0.2734375, + "learning_rate": 9.179575444635686e-06, + "loss": 1.1212, + "step": 80 + }, + { + "epoch": 0.0, + "grad_norm": 0.2470703125, + "learning_rate": 9.753298909925416e-06, + "loss": 1.0751, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 0.271484375, + "learning_rate": 1.0327022375215146e-05, + "loss": 1.0247, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 0.26171875, + "learning_rate": 1.0900745840504876e-05, + "loss": 1.0963, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 0.271484375, + "learning_rate": 1.1474469305794608e-05, + "loss": 1.0563, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.345703125, + "learning_rate": 1.2048192771084338e-05, + "loss": 1.1441, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.447265625, + "learning_rate": 1.2621916236374069e-05, + "loss": 0.9927, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 0.2109375, + "learning_rate": 1.3195639701663797e-05, + "loss": 0.9616, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 0.2373046875, + "learning_rate": 1.3769363166953527e-05, + "loss": 0.9632, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 0.2421875, + "learning_rate": 1.434308663224326e-05, + "loss": 1.0317, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 0.275390625, + "learning_rate": 1.4916810097532991e-05, + "loss": 1.0095, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 0.224609375, + "learning_rate": 1.549053356282272e-05, + "loss": 0.9795, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 0.2421875, + "learning_rate": 1.606425702811245e-05, + "loss": 0.9755, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 0.23828125, + "learning_rate": 1.663798049340218e-05, + "loss": 0.978, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 0.26953125, + "learning_rate": 1.721170395869191e-05, + "loss": 0.9761, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 0.2470703125, + "learning_rate": 1.7785427423981642e-05, + "loss": 0.9764, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 0.4375, + "learning_rate": 1.8359150889271372e-05, + "loss": 0.9912, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 0.275390625, + "learning_rate": 1.8932874354561102e-05, + "loss": 1.0147, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 0.267578125, + "learning_rate": 1.9506597819850832e-05, + "loss": 1.0047, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 0.25390625, + "learning_rate": 2.0080321285140562e-05, + "loss": 0.9577, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 0.2333984375, + "learning_rate": 2.0654044750430293e-05, + "loss": 1.0258, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 0.451171875, + "learning_rate": 2.1227768215720023e-05, + "loss": 1.0307, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 0.27734375, + "learning_rate": 2.1801491681009753e-05, + "loss": 1.0406, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 0.25390625, + "learning_rate": 2.2375215146299486e-05, + "loss": 0.9139, + "step": 195 + }, + { + "epoch": 0.01, + "grad_norm": 0.26171875, + "learning_rate": 2.2948938611589217e-05, + "loss": 1.0682, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 0.322265625, + "learning_rate": 2.3522662076878947e-05, + "loss": 1.0834, + "step": 205 + }, + { + "epoch": 0.01, + "grad_norm": 0.267578125, + "learning_rate": 2.4096385542168677e-05, + "loss": 1.0576, + "step": 210 + }, + { + "epoch": 0.01, + "grad_norm": 0.25390625, + "learning_rate": 2.4670109007458407e-05, + "loss": 1.0109, + "step": 215 + }, + { + "epoch": 0.01, + "grad_norm": 0.279296875, + "learning_rate": 2.5243832472748137e-05, + "loss": 0.9636, + "step": 220 + }, + { + "epoch": 0.01, + "grad_norm": 0.259765625, + "learning_rate": 2.5817555938037867e-05, + "loss": 1.016, + "step": 225 + }, + { + "epoch": 0.01, + "grad_norm": 0.283203125, + "learning_rate": 2.6391279403327594e-05, + "loss": 1.0972, + "step": 230 + }, + { + "epoch": 0.01, + "grad_norm": 0.26953125, + "learning_rate": 2.6965002868617328e-05, + "loss": 0.9953, + "step": 235 + }, + { + "epoch": 0.01, + "grad_norm": 0.283203125, + "learning_rate": 2.7538726333907055e-05, + "loss": 1.0365, + "step": 240 + }, + { + "epoch": 0.01, + "grad_norm": 0.275390625, + "learning_rate": 2.8112449799196788e-05, + "loss": 1.0454, + "step": 245 + }, + { + "epoch": 0.01, + "grad_norm": 0.310546875, + "learning_rate": 2.868617326448652e-05, + "loss": 1.0073, + "step": 250 + }, + { + "epoch": 0.01, + "grad_norm": 0.26953125, + "learning_rate": 2.925989672977625e-05, + "loss": 0.9828, + "step": 255 + }, + { + "epoch": 0.01, + "grad_norm": 0.34765625, + "learning_rate": 2.9833620195065982e-05, + "loss": 0.9571, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 0.31640625, + "learning_rate": 3.040734366035571e-05, + "loss": 1.0254, + "step": 265 + }, + { + "epoch": 0.02, + "grad_norm": 0.283203125, + "learning_rate": 3.098106712564544e-05, + "loss": 0.9365, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 0.2734375, + "learning_rate": 3.155479059093517e-05, + "loss": 0.9507, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 0.296875, + "learning_rate": 3.21285140562249e-05, + "loss": 0.9689, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 0.27734375, + "learning_rate": 3.2702237521514636e-05, + "loss": 1.0112, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 0.29296875, + "learning_rate": 3.327596098680436e-05, + "loss": 1.0417, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 0.27734375, + "learning_rate": 3.3849684452094096e-05, + "loss": 1.0556, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 0.28515625, + "learning_rate": 3.442340791738382e-05, + "loss": 0.9538, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 0.27734375, + "learning_rate": 3.499713138267356e-05, + "loss": 0.9891, + "step": 305 + }, + { + "epoch": 0.02, + "grad_norm": 0.267578125, + "learning_rate": 3.5570854847963284e-05, + "loss": 0.9103, + "step": 310 + }, + { + "epoch": 0.02, + "grad_norm": 0.267578125, + "learning_rate": 3.614457831325301e-05, + "loss": 1.0865, + "step": 315 + }, + { + "epoch": 0.02, + "grad_norm": 0.294921875, + "learning_rate": 3.6718301778542744e-05, + "loss": 0.9835, + "step": 320 + }, + { + "epoch": 0.02, + "grad_norm": 0.28515625, + "learning_rate": 3.729202524383247e-05, + "loss": 1.0064, + "step": 325 + }, + { + "epoch": 0.02, + "grad_norm": 0.30078125, + "learning_rate": 3.7865748709122204e-05, + "loss": 1.0383, + "step": 330 + }, + { + "epoch": 0.02, + "grad_norm": 0.27734375, + "learning_rate": 3.843947217441193e-05, + "loss": 1.0254, + "step": 335 + }, + { + "epoch": 0.02, + "grad_norm": 0.2890625, + "learning_rate": 3.9013195639701665e-05, + "loss": 1.0076, + "step": 340 + }, + { + "epoch": 0.02, + "grad_norm": 0.263671875, + "learning_rate": 3.958691910499139e-05, + "loss": 1.0027, + "step": 345 + }, + { + "epoch": 0.02, + "grad_norm": 0.2890625, + "learning_rate": 4.0160642570281125e-05, + "loss": 1.0093, + "step": 350 + }, + { + "epoch": 0.02, + "grad_norm": 0.283203125, + "learning_rate": 4.073436603557086e-05, + "loss": 1.0416, + "step": 355 + }, + { + "epoch": 0.02, + "grad_norm": 0.2890625, + "learning_rate": 4.1308089500860585e-05, + "loss": 1.0127, + "step": 360 + }, + { + "epoch": 0.02, + "grad_norm": 0.271484375, + "learning_rate": 4.188181296615032e-05, + "loss": 1.0044, + "step": 365 + }, + { + "epoch": 0.02, + "grad_norm": 0.275390625, + "learning_rate": 4.2455536431440046e-05, + "loss": 0.9113, + "step": 370 + }, + { + "epoch": 0.02, + "grad_norm": 0.26171875, + "learning_rate": 4.302925989672978e-05, + "loss": 0.9618, + "step": 375 + }, + { + "epoch": 0.02, + "grad_norm": 0.26953125, + "learning_rate": 4.3602983362019506e-05, + "loss": 0.9842, + "step": 380 + }, + { + "epoch": 0.02, + "grad_norm": 0.318359375, + "learning_rate": 4.417670682730924e-05, + "loss": 0.9779, + "step": 385 + }, + { + "epoch": 0.02, + "grad_norm": 0.29296875, + "learning_rate": 4.475043029259897e-05, + "loss": 0.9694, + "step": 390 + }, + { + "epoch": 0.02, + "grad_norm": 0.265625, + "learning_rate": 4.53241537578887e-05, + "loss": 1.0035, + "step": 395 + }, + { + "epoch": 0.02, + "grad_norm": 0.259765625, + "learning_rate": 4.589787722317843e-05, + "loss": 0.9444, + "step": 400 + }, + { + "epoch": 0.02, + "grad_norm": 0.26953125, + "learning_rate": 4.647160068846816e-05, + "loss": 0.9722, + "step": 405 + }, + { + "epoch": 0.02, + "grad_norm": 0.283203125, + "learning_rate": 4.7045324153757894e-05, + "loss": 0.9614, + "step": 410 + }, + { + "epoch": 0.02, + "grad_norm": 0.26953125, + "learning_rate": 4.761904761904762e-05, + "loss": 1.0325, + "step": 415 + }, + { + "epoch": 0.02, + "grad_norm": 0.279296875, + "learning_rate": 4.8192771084337354e-05, + "loss": 0.9545, + "step": 420 + }, + { + "epoch": 0.02, + "grad_norm": 0.263671875, + "learning_rate": 4.876649454962709e-05, + "loss": 0.9912, + "step": 425 + }, + { + "epoch": 0.02, + "grad_norm": 0.29296875, + "learning_rate": 4.9340218014916814e-05, + "loss": 1.0194, + "step": 430 + }, + { + "epoch": 0.02, + "grad_norm": 0.263671875, + "learning_rate": 4.991394148020654e-05, + "loss": 1.048, + "step": 435 + }, + { + "epoch": 0.03, + "grad_norm": 0.240234375, + "learning_rate": 5.0487664945496275e-05, + "loss": 0.936, + "step": 440 + }, + { + "epoch": 0.03, + "grad_norm": 0.2421875, + "learning_rate": 5.1061388410786e-05, + "loss": 1.0078, + "step": 445 + }, + { + "epoch": 0.03, + "grad_norm": 0.263671875, + "learning_rate": 5.1635111876075735e-05, + "loss": 0.9624, + "step": 450 + }, + { + "epoch": 0.03, + "grad_norm": 0.26171875, + "learning_rate": 5.220883534136547e-05, + "loss": 0.977, + "step": 455 + }, + { + "epoch": 0.03, + "grad_norm": 0.271484375, + "learning_rate": 5.278255880665519e-05, + "loss": 1.0434, + "step": 460 + }, + { + "epoch": 0.03, + "grad_norm": 0.26171875, + "learning_rate": 5.335628227194492e-05, + "loss": 0.9327, + "step": 465 + }, + { + "epoch": 0.03, + "grad_norm": 0.26171875, + "learning_rate": 5.3930005737234656e-05, + "loss": 1.0557, + "step": 470 + }, + { + "epoch": 0.03, + "grad_norm": 0.27734375, + "learning_rate": 5.450372920252439e-05, + "loss": 0.9911, + "step": 475 + }, + { + "epoch": 0.03, + "grad_norm": 0.26171875, + "learning_rate": 5.507745266781411e-05, + "loss": 0.9561, + "step": 480 + }, + { + "epoch": 0.03, + "grad_norm": 0.279296875, + "learning_rate": 5.565117613310384e-05, + "loss": 1.0194, + "step": 485 + }, + { + "epoch": 0.03, + "grad_norm": 0.279296875, + "learning_rate": 5.6224899598393576e-05, + "loss": 0.9214, + "step": 490 + }, + { + "epoch": 0.03, + "grad_norm": 0.283203125, + "learning_rate": 5.679862306368331e-05, + "loss": 0.9457, + "step": 495 + }, + { + "epoch": 0.03, + "grad_norm": 0.287109375, + "learning_rate": 5.737234652897304e-05, + "loss": 0.9919, + "step": 500 + }, + { + "epoch": 0.03, + "grad_norm": 0.298828125, + "learning_rate": 5.794606999426276e-05, + "loss": 0.9558, + "step": 505 + }, + { + "epoch": 0.03, + "grad_norm": 0.2578125, + "learning_rate": 5.85197934595525e-05, + "loss": 0.9813, + "step": 510 + }, + { + "epoch": 0.03, + "grad_norm": 0.27734375, + "learning_rate": 5.909351692484223e-05, + "loss": 1.0898, + "step": 515 + }, + { + "epoch": 0.03, + "grad_norm": 0.302734375, + "learning_rate": 5.9667240390131964e-05, + "loss": 1.0559, + "step": 520 + }, + { + "epoch": 0.03, + "grad_norm": 0.25, + "learning_rate": 6.02409638554217e-05, + "loss": 1.0005, + "step": 525 + }, + { + "epoch": 0.03, + "grad_norm": 0.267578125, + "learning_rate": 6.081468732071142e-05, + "loss": 0.9529, + "step": 530 + }, + { + "epoch": 0.03, + "grad_norm": 0.244140625, + "learning_rate": 6.138841078600115e-05, + "loss": 0.9576, + "step": 535 + }, + { + "epoch": 0.03, + "grad_norm": 0.2470703125, + "learning_rate": 6.196213425129088e-05, + "loss": 0.9927, + "step": 540 + }, + { + "epoch": 0.03, + "grad_norm": 0.251953125, + "learning_rate": 6.253585771658062e-05, + "loss": 0.9369, + "step": 545 + }, + { + "epoch": 0.03, + "grad_norm": 0.265625, + "learning_rate": 6.310958118187034e-05, + "loss": 1.0384, + "step": 550 + }, + { + "epoch": 0.03, + "grad_norm": 0.2451171875, + "learning_rate": 6.368330464716007e-05, + "loss": 0.9414, + "step": 555 + }, + { + "epoch": 0.03, + "grad_norm": 0.265625, + "learning_rate": 6.42570281124498e-05, + "loss": 0.9954, + "step": 560 + }, + { + "epoch": 0.03, + "grad_norm": 0.267578125, + "learning_rate": 6.483075157773954e-05, + "loss": 0.9833, + "step": 565 + }, + { + "epoch": 0.03, + "grad_norm": 0.2373046875, + "learning_rate": 6.540447504302927e-05, + "loss": 1.0212, + "step": 570 + }, + { + "epoch": 0.03, + "grad_norm": 0.25, + "learning_rate": 6.597819850831899e-05, + "loss": 1.0068, + "step": 575 + }, + { + "epoch": 0.03, + "grad_norm": 0.26171875, + "learning_rate": 6.655192197360873e-05, + "loss": 0.9296, + "step": 580 + }, + { + "epoch": 0.03, + "grad_norm": 0.2421875, + "learning_rate": 6.712564543889846e-05, + "loss": 1.0144, + "step": 585 + }, + { + "epoch": 0.03, + "grad_norm": 0.2490234375, + "learning_rate": 6.769936890418819e-05, + "loss": 1.0089, + "step": 590 + }, + { + "epoch": 0.03, + "grad_norm": 0.248046875, + "learning_rate": 6.827309236947793e-05, + "loss": 1.0388, + "step": 595 + }, + { + "epoch": 0.03, + "grad_norm": 0.25390625, + "learning_rate": 6.884681583476765e-05, + "loss": 1.055, + "step": 600 + }, + { + "epoch": 0.03, + "grad_norm": 0.2470703125, + "learning_rate": 6.942053930005738e-05, + "loss": 0.9395, + "step": 605 + }, + { + "epoch": 0.04, + "grad_norm": 0.2470703125, + "learning_rate": 6.999426276534711e-05, + "loss": 0.9684, + "step": 610 + }, + { + "epoch": 0.04, + "grad_norm": 0.2353515625, + "learning_rate": 7.056798623063683e-05, + "loss": 0.9351, + "step": 615 + }, + { + "epoch": 0.04, + "grad_norm": 0.26171875, + "learning_rate": 7.114170969592657e-05, + "loss": 1.0539, + "step": 620 + }, + { + "epoch": 0.04, + "grad_norm": 0.234375, + "learning_rate": 7.17154331612163e-05, + "loss": 1.0311, + "step": 625 + }, + { + "epoch": 0.04, + "grad_norm": 0.2451171875, + "learning_rate": 7.228915662650602e-05, + "loss": 0.9639, + "step": 630 + }, + { + "epoch": 0.04, + "grad_norm": 0.2470703125, + "learning_rate": 7.286288009179575e-05, + "loss": 0.9518, + "step": 635 + }, + { + "epoch": 0.04, + "grad_norm": 0.2578125, + "learning_rate": 7.343660355708549e-05, + "loss": 0.9754, + "step": 640 + }, + { + "epoch": 0.04, + "grad_norm": 0.25, + "learning_rate": 7.401032702237521e-05, + "loss": 0.9694, + "step": 645 + }, + { + "epoch": 0.04, + "grad_norm": 0.2451171875, + "learning_rate": 7.458405048766494e-05, + "loss": 0.9101, + "step": 650 + }, + { + "epoch": 0.04, + "grad_norm": 0.2431640625, + "learning_rate": 7.515777395295467e-05, + "loss": 1.0275, + "step": 655 + }, + { + "epoch": 0.04, + "grad_norm": 0.2490234375, + "learning_rate": 7.573149741824441e-05, + "loss": 0.9585, + "step": 660 + }, + { + "epoch": 0.04, + "grad_norm": 0.25, + "learning_rate": 7.630522088353414e-05, + "loss": 0.9789, + "step": 665 + }, + { + "epoch": 0.04, + "grad_norm": 0.23828125, + "learning_rate": 7.687894434882386e-05, + "loss": 0.8663, + "step": 670 + }, + { + "epoch": 0.04, + "grad_norm": 0.2431640625, + "learning_rate": 7.74526678141136e-05, + "loss": 1.0568, + "step": 675 + }, + { + "epoch": 0.04, + "grad_norm": 0.2392578125, + "learning_rate": 7.802639127940333e-05, + "loss": 1.0539, + "step": 680 + }, + { + "epoch": 0.04, + "grad_norm": 0.255859375, + "learning_rate": 7.860011474469306e-05, + "loss": 0.9589, + "step": 685 + }, + { + "epoch": 0.04, + "grad_norm": 0.263671875, + "learning_rate": 7.917383820998278e-05, + "loss": 0.9545, + "step": 690 + }, + { + "epoch": 0.04, + "grad_norm": 0.24609375, + "learning_rate": 7.974756167527252e-05, + "loss": 0.9511, + "step": 695 + }, + { + "epoch": 0.04, + "grad_norm": 0.25390625, + "learning_rate": 8.032128514056225e-05, + "loss": 1.0379, + "step": 700 + }, + { + "epoch": 0.04, + "grad_norm": 0.232421875, + "learning_rate": 8.089500860585198e-05, + "loss": 0.9617, + "step": 705 + }, + { + "epoch": 0.04, + "grad_norm": 0.2431640625, + "learning_rate": 8.146873207114172e-05, + "loss": 0.9202, + "step": 710 + }, + { + "epoch": 0.04, + "grad_norm": 0.2314453125, + "learning_rate": 8.204245553643144e-05, + "loss": 1.0169, + "step": 715 + }, + { + "epoch": 0.04, + "grad_norm": 0.240234375, + "learning_rate": 8.261617900172117e-05, + "loss": 0.955, + "step": 720 + }, + { + "epoch": 0.04, + "grad_norm": 0.26171875, + "learning_rate": 8.31899024670109e-05, + "loss": 0.9753, + "step": 725 + }, + { + "epoch": 0.04, + "grad_norm": 0.2265625, + "learning_rate": 8.376362593230064e-05, + "loss": 1.0299, + "step": 730 + }, + { + "epoch": 0.04, + "grad_norm": 0.271484375, + "learning_rate": 8.433734939759037e-05, + "loss": 1.0099, + "step": 735 + }, + { + "epoch": 0.04, + "grad_norm": 0.232421875, + "learning_rate": 8.491107286288009e-05, + "loss": 0.9463, + "step": 740 + }, + { + "epoch": 0.04, + "grad_norm": 0.228515625, + "learning_rate": 8.548479632816982e-05, + "loss": 1.0125, + "step": 745 + }, + { + "epoch": 0.04, + "grad_norm": 0.2333984375, + "learning_rate": 8.605851979345956e-05, + "loss": 0.9646, + "step": 750 + }, + { + "epoch": 0.04, + "grad_norm": 0.23046875, + "learning_rate": 8.663224325874929e-05, + "loss": 0.9458, + "step": 755 + }, + { + "epoch": 0.04, + "grad_norm": 0.232421875, + "learning_rate": 8.720596672403901e-05, + "loss": 0.9823, + "step": 760 + }, + { + "epoch": 0.04, + "grad_norm": 0.2314453125, + "learning_rate": 8.777969018932875e-05, + "loss": 0.9312, + "step": 765 + }, + { + "epoch": 0.04, + "grad_norm": 0.232421875, + "learning_rate": 8.835341365461848e-05, + "loss": 1.0313, + "step": 770 + }, + { + "epoch": 0.04, + "grad_norm": 0.2216796875, + "learning_rate": 8.892713711990821e-05, + "loss": 0.9286, + "step": 775 + }, + { + "epoch": 0.04, + "grad_norm": 0.251953125, + "learning_rate": 8.950086058519795e-05, + "loss": 0.9921, + "step": 780 + }, + { + "epoch": 0.05, + "grad_norm": 0.224609375, + "learning_rate": 9.007458405048767e-05, + "loss": 0.9907, + "step": 785 + }, + { + "epoch": 0.05, + "grad_norm": 0.228515625, + "learning_rate": 9.06483075157774e-05, + "loss": 0.9607, + "step": 790 + }, + { + "epoch": 0.05, + "grad_norm": 0.2314453125, + "learning_rate": 9.122203098106713e-05, + "loss": 1.0123, + "step": 795 + }, + { + "epoch": 0.05, + "grad_norm": 0.2216796875, + "learning_rate": 9.179575444635687e-05, + "loss": 0.9162, + "step": 800 + }, + { + "epoch": 0.05, + "grad_norm": 0.2333984375, + "learning_rate": 9.23694779116466e-05, + "loss": 0.94, + "step": 805 + }, + { + "epoch": 0.05, + "grad_norm": 0.228515625, + "learning_rate": 9.294320137693632e-05, + "loss": 0.9643, + "step": 810 + }, + { + "epoch": 0.05, + "grad_norm": 0.232421875, + "learning_rate": 9.351692484222605e-05, + "loss": 0.9354, + "step": 815 + }, + { + "epoch": 0.05, + "grad_norm": 0.2265625, + "learning_rate": 9.409064830751579e-05, + "loss": 0.9763, + "step": 820 + }, + { + "epoch": 0.05, + "grad_norm": 0.21875, + "learning_rate": 9.466437177280552e-05, + "loss": 1.0257, + "step": 825 + }, + { + "epoch": 0.05, + "grad_norm": 0.236328125, + "learning_rate": 9.523809523809524e-05, + "loss": 1.0173, + "step": 830 + }, + { + "epoch": 0.05, + "grad_norm": 0.2294921875, + "learning_rate": 9.581181870338497e-05, + "loss": 0.9775, + "step": 835 + }, + { + "epoch": 0.05, + "grad_norm": 0.2197265625, + "learning_rate": 9.638554216867471e-05, + "loss": 0.9463, + "step": 840 + }, + { + "epoch": 0.05, + "grad_norm": 0.2294921875, + "learning_rate": 9.695926563396444e-05, + "loss": 0.9372, + "step": 845 + }, + { + "epoch": 0.05, + "grad_norm": 0.244140625, + "learning_rate": 9.753298909925417e-05, + "loss": 1.0044, + "step": 850 + }, + { + "epoch": 0.05, + "grad_norm": 0.28515625, + "learning_rate": 9.81067125645439e-05, + "loss": 0.9879, + "step": 855 + }, + { + "epoch": 0.05, + "grad_norm": 0.2451171875, + "learning_rate": 9.868043602983363e-05, + "loss": 1.0205, + "step": 860 + }, + { + "epoch": 0.05, + "grad_norm": 0.2197265625, + "learning_rate": 9.925415949512336e-05, + "loss": 0.9758, + "step": 865 + }, + { + "epoch": 0.05, + "grad_norm": 0.244140625, + "learning_rate": 9.982788296041308e-05, + "loss": 0.9762, + "step": 870 + }, + { + "epoch": 0.05, + "grad_norm": 0.2236328125, + "learning_rate": 0.00010040160642570282, + "loss": 0.9892, + "step": 875 + }, + { + "epoch": 0.05, + "grad_norm": 0.2392578125, + "learning_rate": 0.00010097532989099255, + "loss": 1.0356, + "step": 880 + }, + { + "epoch": 0.05, + "grad_norm": 0.2216796875, + "learning_rate": 0.00010154905335628228, + "loss": 1.0237, + "step": 885 + }, + { + "epoch": 0.05, + "grad_norm": 0.25, + "learning_rate": 0.000102122776821572, + "loss": 1.0023, + "step": 890 + }, + { + "epoch": 0.05, + "grad_norm": 0.224609375, + "learning_rate": 0.00010269650028686174, + "loss": 0.9528, + "step": 895 + }, + { + "epoch": 0.05, + "grad_norm": 0.228515625, + "learning_rate": 0.00010327022375215147, + "loss": 1.0398, + "step": 900 + }, + { + "epoch": 0.05, + "grad_norm": 0.2333984375, + "learning_rate": 0.00010384394721744119, + "loss": 0.9367, + "step": 905 + }, + { + "epoch": 0.05, + "grad_norm": 0.21875, + "learning_rate": 0.00010441767068273094, + "loss": 1.0829, + "step": 910 + }, + { + "epoch": 0.05, + "grad_norm": 0.21875, + "learning_rate": 0.00010499139414802066, + "loss": 0.9457, + "step": 915 + }, + { + "epoch": 0.05, + "grad_norm": 0.2275390625, + "learning_rate": 0.00010556511761331038, + "loss": 1.0245, + "step": 920 + }, + { + "epoch": 0.05, + "grad_norm": 0.224609375, + "learning_rate": 0.00010613884107860012, + "loss": 0.9817, + "step": 925 + }, + { + "epoch": 0.05, + "grad_norm": 0.21484375, + "learning_rate": 0.00010671256454388984, + "loss": 0.9489, + "step": 930 + }, + { + "epoch": 0.05, + "grad_norm": 0.2451171875, + "learning_rate": 0.00010728628800917956, + "loss": 1.0089, + "step": 935 + }, + { + "epoch": 0.05, + "grad_norm": 0.2216796875, + "learning_rate": 0.00010786001147446931, + "loss": 0.9619, + "step": 940 + }, + { + "epoch": 0.05, + "grad_norm": 0.220703125, + "learning_rate": 0.00010843373493975903, + "loss": 0.959, + "step": 945 + }, + { + "epoch": 0.05, + "grad_norm": 0.21484375, + "learning_rate": 0.00010900745840504878, + "loss": 0.9085, + "step": 950 + }, + { + "epoch": 0.05, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001095811818703385, + "loss": 0.9786, + "step": 955 + }, + { + "epoch": 0.06, + "grad_norm": 0.2392578125, + "learning_rate": 0.00011015490533562822, + "loss": 0.9537, + "step": 960 + }, + { + "epoch": 0.06, + "grad_norm": 0.220703125, + "learning_rate": 0.00011072862880091797, + "loss": 0.9376, + "step": 965 + }, + { + "epoch": 0.06, + "grad_norm": 0.2255859375, + "learning_rate": 0.00011130235226620769, + "loss": 0.958, + "step": 970 + }, + { + "epoch": 0.06, + "grad_norm": 0.2412109375, + "learning_rate": 0.00011187607573149743, + "loss": 0.9215, + "step": 975 + }, + { + "epoch": 0.06, + "grad_norm": 0.220703125, + "learning_rate": 0.00011244979919678715, + "loss": 0.9653, + "step": 980 + }, + { + "epoch": 0.06, + "grad_norm": 0.2216796875, + "learning_rate": 0.00011302352266207687, + "loss": 0.9603, + "step": 985 + }, + { + "epoch": 0.06, + "grad_norm": 0.220703125, + "learning_rate": 0.00011359724612736662, + "loss": 0.9093, + "step": 990 + }, + { + "epoch": 0.06, + "grad_norm": 0.2236328125, + "learning_rate": 0.00011417096959265634, + "loss": 0.9243, + "step": 995 + }, + { + "epoch": 0.06, + "grad_norm": 0.2333984375, + "learning_rate": 0.00011474469305794609, + "loss": 1.0176, + "step": 1000 + }, + { + "epoch": 0.06, + "grad_norm": 0.21875, + "learning_rate": 0.0001153184165232358, + "loss": 0.9657, + "step": 1005 + }, + { + "epoch": 0.06, + "grad_norm": 0.2265625, + "learning_rate": 0.00011589213998852553, + "loss": 0.9962, + "step": 1010 + }, + { + "epoch": 0.06, + "grad_norm": 0.23828125, + "learning_rate": 0.00011646586345381527, + "loss": 1.022, + "step": 1015 + }, + { + "epoch": 0.06, + "grad_norm": 0.236328125, + "learning_rate": 0.000117039586919105, + "loss": 0.9741, + "step": 1020 + }, + { + "epoch": 0.06, + "grad_norm": 0.22265625, + "learning_rate": 0.00011761331038439474, + "loss": 0.9569, + "step": 1025 + }, + { + "epoch": 0.06, + "grad_norm": 0.212890625, + "learning_rate": 0.00011818703384968446, + "loss": 0.9687, + "step": 1030 + }, + { + "epoch": 0.06, + "grad_norm": 0.2197265625, + "learning_rate": 0.00011876075731497418, + "loss": 0.9592, + "step": 1035 + }, + { + "epoch": 0.06, + "grad_norm": 0.205078125, + "learning_rate": 0.00011933448078026393, + "loss": 0.9529, + "step": 1040 + }, + { + "epoch": 0.06, + "grad_norm": 0.2099609375, + "learning_rate": 0.00011990820424555365, + "loss": 1.077, + "step": 1045 + }, + { + "epoch": 0.06, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001204819277108434, + "loss": 0.9984, + "step": 1050 + }, + { + "epoch": 0.06, + "grad_norm": 0.25, + "learning_rate": 0.00012105565117613311, + "loss": 0.9589, + "step": 1055 + }, + { + "epoch": 0.06, + "grad_norm": 0.2236328125, + "learning_rate": 0.00012162937464142283, + "loss": 0.9386, + "step": 1060 + }, + { + "epoch": 0.06, + "grad_norm": 0.2265625, + "learning_rate": 0.00012220309810671257, + "loss": 1.0137, + "step": 1065 + }, + { + "epoch": 0.06, + "grad_norm": 0.2265625, + "learning_rate": 0.0001227768215720023, + "loss": 0.8944, + "step": 1070 + }, + { + "epoch": 0.06, + "grad_norm": 0.22265625, + "learning_rate": 0.000123350545037292, + "loss": 0.9208, + "step": 1075 + }, + { + "epoch": 0.06, + "grad_norm": 0.2236328125, + "learning_rate": 0.00012392426850258177, + "loss": 0.9763, + "step": 1080 + }, + { + "epoch": 0.06, + "grad_norm": 0.2158203125, + "learning_rate": 0.00012449799196787148, + "loss": 0.9631, + "step": 1085 + }, + { + "epoch": 0.06, + "grad_norm": 0.2138671875, + "learning_rate": 0.00012507171543316124, + "loss": 0.9173, + "step": 1090 + }, + { + "epoch": 0.06, + "grad_norm": 0.318359375, + "learning_rate": 0.00012564543889845094, + "loss": 0.9671, + "step": 1095 + }, + { + "epoch": 0.06, + "grad_norm": 0.23046875, + "learning_rate": 0.00012621916236374068, + "loss": 0.9751, + "step": 1100 + }, + { + "epoch": 0.06, + "grad_norm": 0.234375, + "learning_rate": 0.0001267928858290304, + "loss": 0.9351, + "step": 1105 + }, + { + "epoch": 0.06, + "grad_norm": 0.2216796875, + "learning_rate": 0.00012736660929432014, + "loss": 0.9701, + "step": 1110 + }, + { + "epoch": 0.06, + "grad_norm": 0.2119140625, + "learning_rate": 0.00012794033275960988, + "loss": 0.9568, + "step": 1115 + }, + { + "epoch": 0.06, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001285140562248996, + "loss": 0.9354, + "step": 1120 + }, + { + "epoch": 0.06, + "grad_norm": 0.2158203125, + "learning_rate": 0.00012908777969018932, + "loss": 0.9803, + "step": 1125 + }, + { + "epoch": 0.06, + "grad_norm": 0.2275390625, + "learning_rate": 0.00012966150315547908, + "loss": 0.9745, + "step": 1130 + }, + { + "epoch": 0.07, + "grad_norm": 0.2216796875, + "learning_rate": 0.00013023522662076878, + "loss": 1.0447, + "step": 1135 + }, + { + "epoch": 0.07, + "grad_norm": 0.2109375, + "learning_rate": 0.00013080895008605854, + "loss": 0.9392, + "step": 1140 + }, + { + "epoch": 0.07, + "grad_norm": 0.21875, + "learning_rate": 0.00013138267355134825, + "loss": 1.0016, + "step": 1145 + }, + { + "epoch": 0.07, + "grad_norm": 0.224609375, + "learning_rate": 0.00013195639701663798, + "loss": 1.0061, + "step": 1150 + }, + { + "epoch": 0.07, + "grad_norm": 0.21484375, + "learning_rate": 0.00013253012048192772, + "loss": 0.9837, + "step": 1155 + }, + { + "epoch": 0.07, + "grad_norm": 0.23828125, + "learning_rate": 0.00013310384394721745, + "loss": 1.0022, + "step": 1160 + }, + { + "epoch": 0.07, + "grad_norm": 0.21875, + "learning_rate": 0.00013367756741250719, + "loss": 0.978, + "step": 1165 + }, + { + "epoch": 0.07, + "grad_norm": 0.220703125, + "learning_rate": 0.00013425129087779692, + "loss": 0.9534, + "step": 1170 + }, + { + "epoch": 0.07, + "grad_norm": 0.220703125, + "learning_rate": 0.00013482501434308663, + "loss": 0.9628, + "step": 1175 + }, + { + "epoch": 0.07, + "grad_norm": 0.2177734375, + "learning_rate": 0.00013539873780837639, + "loss": 0.9247, + "step": 1180 + }, + { + "epoch": 0.07, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001359724612736661, + "loss": 0.9551, + "step": 1185 + }, + { + "epoch": 0.07, + "grad_norm": 0.2119140625, + "learning_rate": 0.00013654618473895585, + "loss": 0.9261, + "step": 1190 + }, + { + "epoch": 0.07, + "grad_norm": 0.2255859375, + "learning_rate": 0.00013711990820424556, + "loss": 1.0057, + "step": 1195 + }, + { + "epoch": 0.07, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001376936316695353, + "loss": 0.9528, + "step": 1200 + }, + { + "epoch": 0.07, + "grad_norm": 0.224609375, + "learning_rate": 0.00013826735513482503, + "loss": 0.9443, + "step": 1205 + }, + { + "epoch": 0.07, + "grad_norm": 0.228515625, + "learning_rate": 0.00013884107860011476, + "loss": 0.9167, + "step": 1210 + }, + { + "epoch": 0.07, + "grad_norm": 0.220703125, + "learning_rate": 0.00013941480206540447, + "loss": 0.95, + "step": 1215 + }, + { + "epoch": 0.07, + "grad_norm": 0.2080078125, + "learning_rate": 0.00013998852553069423, + "loss": 0.9668, + "step": 1220 + }, + { + "epoch": 0.07, + "grad_norm": 0.2109375, + "learning_rate": 0.00014056224899598393, + "loss": 0.9145, + "step": 1225 + }, + { + "epoch": 0.07, + "grad_norm": 0.2236328125, + "learning_rate": 0.00014113597246127367, + "loss": 0.9576, + "step": 1230 + }, + { + "epoch": 0.07, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001417096959265634, + "loss": 0.9274, + "step": 1235 + }, + { + "epoch": 0.07, + "grad_norm": 0.21484375, + "learning_rate": 0.00014228341939185313, + "loss": 0.9507, + "step": 1240 + }, + { + "epoch": 0.07, + "grad_norm": 0.2333984375, + "learning_rate": 0.00014285714285714287, + "loss": 0.9324, + "step": 1245 + }, + { + "epoch": 0.07, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001434308663224326, + "loss": 1.0164, + "step": 1250 + }, + { + "epoch": 0.07, + "grad_norm": 0.2333984375, + "learning_rate": 0.00014400458978772233, + "loss": 0.9863, + "step": 1255 + }, + { + "epoch": 0.07, + "grad_norm": 0.21875, + "learning_rate": 0.00014457831325301204, + "loss": 0.9704, + "step": 1260 + }, + { + "epoch": 0.07, + "grad_norm": 0.216796875, + "learning_rate": 0.00014515203671830177, + "loss": 1.0213, + "step": 1265 + }, + { + "epoch": 0.07, + "grad_norm": 0.212890625, + "learning_rate": 0.0001457257601835915, + "loss": 0.9403, + "step": 1270 + }, + { + "epoch": 0.07, + "grad_norm": 0.2236328125, + "learning_rate": 0.00014629948364888124, + "loss": 0.9601, + "step": 1275 + }, + { + "epoch": 0.07, + "grad_norm": 0.232421875, + "learning_rate": 0.00014687320711417098, + "loss": 1.0241, + "step": 1280 + }, + { + "epoch": 0.07, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001474469305794607, + "loss": 1.0017, + "step": 1285 + }, + { + "epoch": 0.07, + "grad_norm": 0.2216796875, + "learning_rate": 0.00014802065404475042, + "loss": 0.9326, + "step": 1290 + }, + { + "epoch": 0.07, + "grad_norm": 0.2353515625, + "learning_rate": 0.00014859437751004018, + "loss": 1.0238, + "step": 1295 + }, + { + "epoch": 0.07, + "grad_norm": 0.2265625, + "learning_rate": 0.00014916810097532988, + "loss": 0.9449, + "step": 1300 + }, + { + "epoch": 0.07, + "grad_norm": 0.2255859375, + "learning_rate": 0.00014974182444061964, + "loss": 0.9592, + "step": 1305 + }, + { + "epoch": 0.08, + "grad_norm": 0.22265625, + "learning_rate": 0.00015031554790590935, + "loss": 0.9213, + "step": 1310 + }, + { + "epoch": 0.08, + "grad_norm": 0.2109375, + "learning_rate": 0.00015088927137119908, + "loss": 0.9286, + "step": 1315 + }, + { + "epoch": 0.08, + "grad_norm": 0.228515625, + "learning_rate": 0.00015146299483648882, + "loss": 0.9357, + "step": 1320 + }, + { + "epoch": 0.08, + "grad_norm": 0.234375, + "learning_rate": 0.00015203671830177855, + "loss": 0.9788, + "step": 1325 + }, + { + "epoch": 0.08, + "grad_norm": 0.2236328125, + "learning_rate": 0.00015261044176706828, + "loss": 0.9794, + "step": 1330 + }, + { + "epoch": 0.08, + "grad_norm": 0.234375, + "learning_rate": 0.00015318416523235802, + "loss": 0.9329, + "step": 1335 + }, + { + "epoch": 0.08, + "grad_norm": 0.2177734375, + "learning_rate": 0.00015375788869764772, + "loss": 0.9709, + "step": 1340 + }, + { + "epoch": 0.08, + "grad_norm": 0.2451171875, + "learning_rate": 0.00015433161216293748, + "loss": 1.0187, + "step": 1345 + }, + { + "epoch": 0.08, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001549053356282272, + "loss": 0.9459, + "step": 1350 + }, + { + "epoch": 0.08, + "grad_norm": 0.2275390625, + "learning_rate": 0.00015547905909351695, + "loss": 0.9776, + "step": 1355 + }, + { + "epoch": 0.08, + "grad_norm": 0.2265625, + "learning_rate": 0.00015605278255880666, + "loss": 0.9328, + "step": 1360 + }, + { + "epoch": 0.08, + "grad_norm": 0.21875, + "learning_rate": 0.0001566265060240964, + "loss": 1.001, + "step": 1365 + }, + { + "epoch": 0.08, + "grad_norm": 0.21875, + "learning_rate": 0.00015720022948938613, + "loss": 0.9298, + "step": 1370 + }, + { + "epoch": 0.08, + "grad_norm": 0.2197265625, + "learning_rate": 0.00015777395295467586, + "loss": 0.9574, + "step": 1375 + }, + { + "epoch": 0.08, + "grad_norm": 0.2470703125, + "learning_rate": 0.00015834767641996557, + "loss": 0.9998, + "step": 1380 + }, + { + "epoch": 0.08, + "grad_norm": 0.2431640625, + "learning_rate": 0.00015892139988525533, + "loss": 0.9435, + "step": 1385 + }, + { + "epoch": 0.08, + "grad_norm": 0.2275390625, + "learning_rate": 0.00015949512335054503, + "loss": 0.9459, + "step": 1390 + }, + { + "epoch": 0.08, + "grad_norm": 0.2109375, + "learning_rate": 0.0001600688468158348, + "loss": 0.9581, + "step": 1395 + }, + { + "epoch": 0.08, + "grad_norm": 0.21875, + "learning_rate": 0.0001606425702811245, + "loss": 0.9255, + "step": 1400 + }, + { + "epoch": 0.08, + "grad_norm": 0.2275390625, + "learning_rate": 0.00016121629374641423, + "loss": 1.0634, + "step": 1405 + }, + { + "epoch": 0.08, + "grad_norm": 0.224609375, + "learning_rate": 0.00016179001721170397, + "loss": 1.0368, + "step": 1410 + }, + { + "epoch": 0.08, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001623637406769937, + "loss": 0.9813, + "step": 1415 + }, + { + "epoch": 0.08, + "grad_norm": 0.2216796875, + "learning_rate": 0.00016293746414228343, + "loss": 0.9923, + "step": 1420 + }, + { + "epoch": 0.08, + "grad_norm": 0.2314453125, + "learning_rate": 0.00016351118760757317, + "loss": 0.9478, + "step": 1425 + }, + { + "epoch": 0.08, + "grad_norm": 0.216796875, + "learning_rate": 0.00016408491107286287, + "loss": 1.034, + "step": 1430 + }, + { + "epoch": 0.08, + "grad_norm": 0.23046875, + "learning_rate": 0.00016465863453815263, + "loss": 0.9275, + "step": 1435 + }, + { + "epoch": 0.08, + "grad_norm": 0.251953125, + "learning_rate": 0.00016523235800344234, + "loss": 1.0333, + "step": 1440 + }, + { + "epoch": 0.08, + "grad_norm": 0.234375, + "learning_rate": 0.0001658060814687321, + "loss": 0.9415, + "step": 1445 + }, + { + "epoch": 0.08, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001663798049340218, + "loss": 0.9903, + "step": 1450 + }, + { + "epoch": 0.08, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016695352839931154, + "loss": 0.9824, + "step": 1455 + }, + { + "epoch": 0.08, + "grad_norm": 0.2265625, + "learning_rate": 0.00016752725186460127, + "loss": 0.8981, + "step": 1460 + }, + { + "epoch": 0.08, + "grad_norm": 0.23046875, + "learning_rate": 0.000168100975329891, + "loss": 0.9891, + "step": 1465 + }, + { + "epoch": 0.08, + "grad_norm": 0.2421875, + "learning_rate": 0.00016867469879518074, + "loss": 1.0248, + "step": 1470 + }, + { + "epoch": 0.08, + "grad_norm": 0.2451171875, + "learning_rate": 0.00016924842226047048, + "loss": 1.0793, + "step": 1475 + }, + { + "epoch": 0.08, + "grad_norm": 0.232421875, + "learning_rate": 0.00016982214572576018, + "loss": 0.9942, + "step": 1480 + }, + { + "epoch": 0.09, + "grad_norm": 0.2236328125, + "learning_rate": 0.00017039586919104992, + "loss": 0.9166, + "step": 1485 + }, + { + "epoch": 0.09, + "grad_norm": 0.2099609375, + "learning_rate": 0.00017096959265633965, + "loss": 0.9555, + "step": 1490 + }, + { + "epoch": 0.09, + "grad_norm": 0.2255859375, + "learning_rate": 0.00017154331612162938, + "loss": 0.897, + "step": 1495 + }, + { + "epoch": 0.09, + "grad_norm": 0.2236328125, + "learning_rate": 0.00017211703958691912, + "loss": 1.0034, + "step": 1500 + }, + { + "epoch": 0.09, + "grad_norm": 0.224609375, + "learning_rate": 0.00017269076305220885, + "loss": 1.0096, + "step": 1505 + }, + { + "epoch": 0.09, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017326448651749858, + "loss": 1.0415, + "step": 1510 + }, + { + "epoch": 0.09, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001738382099827883, + "loss": 0.9825, + "step": 1515 + }, + { + "epoch": 0.09, + "grad_norm": 0.24609375, + "learning_rate": 0.00017441193344807802, + "loss": 0.994, + "step": 1520 + }, + { + "epoch": 0.09, + "grad_norm": 0.23828125, + "learning_rate": 0.00017498565691336776, + "loss": 1.0082, + "step": 1525 + }, + { + "epoch": 0.09, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001755593803786575, + "loss": 0.9467, + "step": 1530 + }, + { + "epoch": 0.09, + "grad_norm": 0.23046875, + "learning_rate": 0.00017613310384394722, + "loss": 0.895, + "step": 1535 + }, + { + "epoch": 0.09, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017670682730923696, + "loss": 0.9505, + "step": 1540 + }, + { + "epoch": 0.09, + "grad_norm": 0.232421875, + "learning_rate": 0.00017728055077452666, + "loss": 1.0304, + "step": 1545 + }, + { + "epoch": 0.09, + "grad_norm": 0.236328125, + "learning_rate": 0.00017785427423981642, + "loss": 0.9931, + "step": 1550 + }, + { + "epoch": 0.09, + "grad_norm": 0.240234375, + "learning_rate": 0.00017842799770510613, + "loss": 0.9413, + "step": 1555 + }, + { + "epoch": 0.09, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001790017211703959, + "loss": 0.9221, + "step": 1560 + }, + { + "epoch": 0.09, + "grad_norm": 0.240234375, + "learning_rate": 0.0001795754446356856, + "loss": 0.9809, + "step": 1565 + }, + { + "epoch": 0.09, + "grad_norm": 0.236328125, + "learning_rate": 0.00018014916810097533, + "loss": 0.9561, + "step": 1570 + }, + { + "epoch": 0.09, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018072289156626507, + "loss": 0.9034, + "step": 1575 + }, + { + "epoch": 0.09, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001812966150315548, + "loss": 0.9123, + "step": 1580 + }, + { + "epoch": 0.09, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018187033849684453, + "loss": 0.9413, + "step": 1585 + }, + { + "epoch": 0.09, + "grad_norm": 0.25390625, + "learning_rate": 0.00018244406196213427, + "loss": 0.9556, + "step": 1590 + }, + { + "epoch": 0.09, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018301778542742397, + "loss": 0.9516, + "step": 1595 + }, + { + "epoch": 0.09, + "grad_norm": 0.2421875, + "learning_rate": 0.00018359150889271373, + "loss": 1.0159, + "step": 1600 + }, + { + "epoch": 0.09, + "grad_norm": 0.228515625, + "learning_rate": 0.00018416523235800344, + "loss": 0.9851, + "step": 1605 + }, + { + "epoch": 0.09, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001847389558232932, + "loss": 0.9938, + "step": 1610 + }, + { + "epoch": 0.09, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001853126792885829, + "loss": 1.0143, + "step": 1615 + }, + { + "epoch": 0.09, + "grad_norm": 0.240234375, + "learning_rate": 0.00018588640275387264, + "loss": 0.9578, + "step": 1620 + }, + { + "epoch": 0.09, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018646012621916237, + "loss": 0.9961, + "step": 1625 + }, + { + "epoch": 0.09, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001870338496844521, + "loss": 0.9975, + "step": 1630 + }, + { + "epoch": 0.09, + "grad_norm": 0.2197265625, + "learning_rate": 0.00018760757314974184, + "loss": 0.9234, + "step": 1635 + }, + { + "epoch": 0.09, + "grad_norm": 0.240234375, + "learning_rate": 0.00018818129661503157, + "loss": 0.9473, + "step": 1640 + }, + { + "epoch": 0.09, + "grad_norm": 0.251953125, + "learning_rate": 0.00018875502008032128, + "loss": 1.0179, + "step": 1645 + }, + { + "epoch": 0.09, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018932874354561104, + "loss": 0.8971, + "step": 1650 + }, + { + "epoch": 0.09, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018990246701090075, + "loss": 0.9953, + "step": 1655 + }, + { + "epoch": 0.1, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019047619047619048, + "loss": 0.9251, + "step": 1660 + }, + { + "epoch": 0.1, + "grad_norm": 0.232421875, + "learning_rate": 0.00019104991394148021, + "loss": 0.9521, + "step": 1665 + }, + { + "epoch": 0.1, + "grad_norm": 0.23046875, + "learning_rate": 0.00019162363740676995, + "loss": 0.9647, + "step": 1670 + }, + { + "epoch": 0.1, + "grad_norm": 0.240234375, + "learning_rate": 0.00019219736087205968, + "loss": 0.9525, + "step": 1675 + }, + { + "epoch": 0.1, + "grad_norm": 0.24609375, + "learning_rate": 0.00019277108433734942, + "loss": 0.9947, + "step": 1680 + }, + { + "epoch": 0.1, + "grad_norm": 0.240234375, + "learning_rate": 0.00019334480780263912, + "loss": 0.9591, + "step": 1685 + }, + { + "epoch": 0.1, + "grad_norm": 0.236328125, + "learning_rate": 0.00019391853126792888, + "loss": 0.9734, + "step": 1690 + }, + { + "epoch": 0.1, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001944922547332186, + "loss": 1.0063, + "step": 1695 + }, + { + "epoch": 0.1, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019506597819850835, + "loss": 0.9684, + "step": 1700 + }, + { + "epoch": 0.1, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019563970166379806, + "loss": 0.9499, + "step": 1705 + }, + { + "epoch": 0.1, + "grad_norm": 0.23828125, + "learning_rate": 0.0001962134251290878, + "loss": 0.9455, + "step": 1710 + }, + { + "epoch": 0.1, + "grad_norm": 0.25390625, + "learning_rate": 0.00019678714859437752, + "loss": 0.9922, + "step": 1715 + }, + { + "epoch": 0.1, + "grad_norm": 0.2431640625, + "learning_rate": 0.00019736087205966726, + "loss": 1.0273, + "step": 1720 + }, + { + "epoch": 0.1, + "grad_norm": 0.2412109375, + "learning_rate": 0.000197934595524957, + "loss": 1.0015, + "step": 1725 + }, + { + "epoch": 0.1, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019850831899024672, + "loss": 0.9356, + "step": 1730 + }, + { + "epoch": 0.1, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019908204245553643, + "loss": 0.9609, + "step": 1735 + }, + { + "epoch": 0.1, + "grad_norm": 0.23046875, + "learning_rate": 0.00019965576592082616, + "loss": 0.9577, + "step": 1740 + }, + { + "epoch": 0.1, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019999999197655853, + "loss": 0.9813, + "step": 1745 + }, + { + "epoch": 0.1, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001999999017128567, + "loss": 0.9655, + "step": 1750 + }, + { + "epoch": 0.1, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019999971115624204, + "loss": 0.9365, + "step": 1755 + }, + { + "epoch": 0.1, + "grad_norm": 0.228515625, + "learning_rate": 0.00019999942030690567, + "loss": 1.0163, + "step": 1760 + }, + { + "epoch": 0.1, + "grad_norm": 0.244140625, + "learning_rate": 0.0001999990291651393, + "loss": 0.9904, + "step": 1765 + }, + { + "epoch": 0.1, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019999853773133513, + "loss": 0.9746, + "step": 1770 + }, + { + "epoch": 0.1, + "grad_norm": 0.271484375, + "learning_rate": 0.00019999794600598616, + "loss": 1.0293, + "step": 1775 + }, + { + "epoch": 0.1, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019999725398968577, + "loss": 0.928, + "step": 1780 + }, + { + "epoch": 0.1, + "grad_norm": 0.244140625, + "learning_rate": 0.000199996461683128, + "loss": 0.967, + "step": 1785 + }, + { + "epoch": 0.1, + "grad_norm": 0.22265625, + "learning_rate": 0.00019999556908710753, + "loss": 0.9306, + "step": 1790 + }, + { + "epoch": 0.1, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019999457620251953, + "loss": 0.8565, + "step": 1795 + }, + { + "epoch": 0.1, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001999934830303598, + "loss": 0.9218, + "step": 1800 + }, + { + "epoch": 0.1, + "grad_norm": 0.232421875, + "learning_rate": 0.00019999228957172477, + "loss": 0.8999, + "step": 1805 + }, + { + "epoch": 0.1, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001999909958278113, + "loss": 0.958, + "step": 1810 + }, + { + "epoch": 0.1, + "grad_norm": 0.23828125, + "learning_rate": 0.000199989601799917, + "loss": 0.9169, + "step": 1815 + }, + { + "epoch": 0.1, + "grad_norm": 0.248046875, + "learning_rate": 0.00019998810748943994, + "loss": 0.9944, + "step": 1820 + }, + { + "epoch": 0.1, + "grad_norm": 0.255859375, + "learning_rate": 0.00019998651289787885, + "loss": 0.9755, + "step": 1825 + }, + { + "epoch": 0.11, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019998481802683293, + "loss": 0.9872, + "step": 1830 + }, + { + "epoch": 0.11, + "grad_norm": 0.263671875, + "learning_rate": 0.00019998302287800208, + "loss": 0.9734, + "step": 1835 + }, + { + "epoch": 0.11, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019998112745318669, + "loss": 0.9889, + "step": 1840 + }, + { + "epoch": 0.11, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019997913175428771, + "loss": 0.9554, + "step": 1845 + }, + { + "epoch": 0.11, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019997703578330674, + "loss": 1.0364, + "step": 1850 + }, + { + "epoch": 0.11, + "grad_norm": 0.2265625, + "learning_rate": 0.00019997483954234583, + "loss": 0.9727, + "step": 1855 + }, + { + "epoch": 0.11, + "grad_norm": 0.2431640625, + "learning_rate": 0.00019997254303360772, + "loss": 0.897, + "step": 1860 + }, + { + "epoch": 0.11, + "grad_norm": 0.251953125, + "learning_rate": 0.0001999701462593956, + "loss": 0.9793, + "step": 1865 + }, + { + "epoch": 0.11, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019996764922211328, + "loss": 0.9048, + "step": 1870 + }, + { + "epoch": 0.11, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001999650519242651, + "loss": 0.977, + "step": 1875 + }, + { + "epoch": 0.11, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019996235436845603, + "loss": 0.9698, + "step": 1880 + }, + { + "epoch": 0.11, + "grad_norm": 0.2421875, + "learning_rate": 0.00019995955655739147, + "loss": 0.9077, + "step": 1885 + }, + { + "epoch": 0.11, + "grad_norm": 0.24609375, + "learning_rate": 0.00019995665849387742, + "loss": 0.9419, + "step": 1890 + }, + { + "epoch": 0.11, + "grad_norm": 0.220703125, + "learning_rate": 0.0001999536601808205, + "loss": 0.9445, + "step": 1895 + }, + { + "epoch": 0.11, + "grad_norm": 0.224609375, + "learning_rate": 0.00019995056162122775, + "loss": 1.0352, + "step": 1900 + }, + { + "epoch": 0.11, + "grad_norm": 0.236328125, + "learning_rate": 0.0001999473628182068, + "loss": 1.0013, + "step": 1905 + }, + { + "epoch": 0.11, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019994406377496588, + "loss": 0.9071, + "step": 1910 + }, + { + "epoch": 0.11, + "grad_norm": 0.251953125, + "learning_rate": 0.00019994066449481368, + "loss": 0.9322, + "step": 1915 + }, + { + "epoch": 0.11, + "grad_norm": 0.25, + "learning_rate": 0.0001999371649811594, + "loss": 0.8917, + "step": 1920 + }, + { + "epoch": 0.11, + "grad_norm": 0.2578125, + "learning_rate": 0.00019993356523751287, + "loss": 0.9989, + "step": 1925 + }, + { + "epoch": 0.11, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019992986526748432, + "loss": 0.8982, + "step": 1930 + }, + { + "epoch": 0.11, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019992606507478463, + "loss": 0.9933, + "step": 1935 + }, + { + "epoch": 0.11, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001999221646632251, + "loss": 0.9206, + "step": 1940 + }, + { + "epoch": 0.11, + "grad_norm": 0.26171875, + "learning_rate": 0.0001999181640367175, + "loss": 1.0054, + "step": 1945 + }, + { + "epoch": 0.11, + "grad_norm": 0.2265625, + "learning_rate": 0.00019991406319927428, + "loss": 0.9391, + "step": 1950 + }, + { + "epoch": 0.11, + "grad_norm": 0.251953125, + "learning_rate": 0.00019990986215500826, + "loss": 0.9102, + "step": 1955 + }, + { + "epoch": 0.11, + "grad_norm": 0.2431640625, + "learning_rate": 0.00019990556090813277, + "loss": 0.9721, + "step": 1960 + }, + { + "epoch": 0.11, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019990115946296168, + "loss": 0.9735, + "step": 1965 + }, + { + "epoch": 0.11, + "grad_norm": 0.2421875, + "learning_rate": 0.00019989665782390933, + "loss": 0.9573, + "step": 1970 + }, + { + "epoch": 0.11, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019989205599549057, + "loss": 0.972, + "step": 1975 + }, + { + "epoch": 0.11, + "grad_norm": 0.2431640625, + "learning_rate": 0.00019988735398232066, + "loss": 0.9919, + "step": 1980 + }, + { + "epoch": 0.11, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019988255178911543, + "loss": 0.9882, + "step": 1985 + }, + { + "epoch": 0.11, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019987764942069113, + "loss": 0.9546, + "step": 1990 + }, + { + "epoch": 0.11, + "grad_norm": 0.234375, + "learning_rate": 0.00019987264688196453, + "loss": 0.994, + "step": 1995 + }, + { + "epoch": 0.11, + "grad_norm": 0.236328125, + "learning_rate": 0.00019986754417795278, + "loss": 0.9044, + "step": 2000 + }, + { + "epoch": 0.12, + "grad_norm": 0.26171875, + "learning_rate": 0.00019986234131377353, + "loss": 0.9218, + "step": 2005 + }, + { + "epoch": 0.12, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019985703829464492, + "loss": 1.0188, + "step": 2010 + }, + { + "epoch": 0.12, + "grad_norm": 0.259765625, + "learning_rate": 0.0001998516351258855, + "loss": 0.9972, + "step": 2015 + }, + { + "epoch": 0.12, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019984613181291426, + "loss": 0.9235, + "step": 2020 + }, + { + "epoch": 0.12, + "grad_norm": 0.248046875, + "learning_rate": 0.00019984052836125065, + "loss": 0.9297, + "step": 2025 + }, + { + "epoch": 0.12, + "grad_norm": 0.2373046875, + "learning_rate": 0.00019983482477651455, + "loss": 0.8868, + "step": 2030 + }, + { + "epoch": 0.12, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019982902106442622, + "loss": 0.972, + "step": 2035 + }, + { + "epoch": 0.12, + "grad_norm": 0.25, + "learning_rate": 0.00019982311723080643, + "loss": 0.96, + "step": 2040 + }, + { + "epoch": 0.12, + "grad_norm": 0.2431640625, + "learning_rate": 0.00019981711328157626, + "loss": 0.8752, + "step": 2045 + }, + { + "epoch": 0.12, + "grad_norm": 0.248046875, + "learning_rate": 0.00019981100922275728, + "loss": 0.9826, + "step": 2050 + }, + { + "epoch": 0.12, + "grad_norm": 0.24609375, + "learning_rate": 0.00019980480506047143, + "loss": 1.0445, + "step": 2055 + }, + { + "epoch": 0.12, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019979850080094108, + "loss": 0.9625, + "step": 2060 + }, + { + "epoch": 0.12, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019979209645048889, + "loss": 0.9338, + "step": 2065 + }, + { + "epoch": 0.12, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019978559201553803, + "loss": 0.9259, + "step": 2070 + }, + { + "epoch": 0.12, + "grad_norm": 0.2412109375, + "learning_rate": 0.000199778987502612, + "loss": 0.9573, + "step": 2075 + }, + { + "epoch": 0.12, + "grad_norm": 0.24609375, + "learning_rate": 0.00019977228291833462, + "loss": 1.0202, + "step": 2080 + }, + { + "epoch": 0.12, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019976547826943019, + "loss": 0.9039, + "step": 2085 + }, + { + "epoch": 0.12, + "grad_norm": 0.26953125, + "learning_rate": 0.00019975857356272322, + "loss": 0.9118, + "step": 2090 + }, + { + "epoch": 0.12, + "grad_norm": 0.251953125, + "learning_rate": 0.00019975156880513867, + "loss": 0.9077, + "step": 2095 + }, + { + "epoch": 0.12, + "grad_norm": 0.2578125, + "learning_rate": 0.00019974446400370185, + "loss": 0.9197, + "step": 2100 + }, + { + "epoch": 0.12, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019973725916553838, + "loss": 0.9025, + "step": 2105 + }, + { + "epoch": 0.12, + "grad_norm": 0.310546875, + "learning_rate": 0.00019972995429787415, + "loss": 0.9744, + "step": 2110 + }, + { + "epoch": 0.12, + "grad_norm": 0.2431640625, + "learning_rate": 0.00019972254940803553, + "loss": 0.922, + "step": 2115 + }, + { + "epoch": 0.12, + "grad_norm": 0.240234375, + "learning_rate": 0.00019971504450344902, + "loss": 0.994, + "step": 2120 + }, + { + "epoch": 0.12, + "grad_norm": 0.236328125, + "learning_rate": 0.00019970743959164155, + "loss": 0.9548, + "step": 2125 + }, + { + "epoch": 0.12, + "grad_norm": 0.255859375, + "learning_rate": 0.00019969973468024032, + "loss": 0.9639, + "step": 2130 + }, + { + "epoch": 0.12, + "grad_norm": 0.251953125, + "learning_rate": 0.0001996919297769728, + "loss": 0.9711, + "step": 2135 + }, + { + "epoch": 0.12, + "grad_norm": 0.25, + "learning_rate": 0.00019968402488966677, + "loss": 0.8984, + "step": 2140 + }, + { + "epoch": 0.12, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019967602002625027, + "loss": 0.9091, + "step": 2145 + }, + { + "epoch": 0.12, + "grad_norm": 0.2578125, + "learning_rate": 0.00019966791519475169, + "loss": 0.9957, + "step": 2150 + }, + { + "epoch": 0.12, + "grad_norm": 0.240234375, + "learning_rate": 0.00019965971040329948, + "loss": 0.9527, + "step": 2155 + }, + { + "epoch": 0.12, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019965140566012253, + "loss": 0.8977, + "step": 2160 + }, + { + "epoch": 0.12, + "grad_norm": 0.26953125, + "learning_rate": 0.00019964300097355, + "loss": 0.9534, + "step": 2165 + }, + { + "epoch": 0.12, + "grad_norm": 0.248046875, + "learning_rate": 0.0001996344963520111, + "loss": 0.9924, + "step": 2170 + }, + { + "epoch": 0.12, + "grad_norm": 0.236328125, + "learning_rate": 0.00019962589180403542, + "loss": 0.9923, + "step": 2175 + }, + { + "epoch": 0.13, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019961718733825268, + "loss": 0.957, + "step": 2180 + }, + { + "epoch": 0.13, + "grad_norm": 0.23828125, + "learning_rate": 0.00019960838296339287, + "loss": 0.9071, + "step": 2185 + }, + { + "epoch": 0.13, + "grad_norm": 0.25390625, + "learning_rate": 0.00019959947868828618, + "loss": 0.9991, + "step": 2190 + }, + { + "epoch": 0.13, + "grad_norm": 0.25, + "learning_rate": 0.00019959047452186294, + "loss": 1.0085, + "step": 2195 + }, + { + "epoch": 0.13, + "grad_norm": 0.240234375, + "learning_rate": 0.00019958137047315375, + "loss": 0.9302, + "step": 2200 + }, + { + "epoch": 0.13, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001995721665512893, + "loss": 0.9795, + "step": 2205 + }, + { + "epoch": 0.13, + "grad_norm": 0.23046875, + "learning_rate": 0.00019956286276550048, + "loss": 0.943, + "step": 2210 + }, + { + "epoch": 0.13, + "grad_norm": 0.2578125, + "learning_rate": 0.00019955345912511837, + "loss": 0.9711, + "step": 2215 + }, + { + "epoch": 0.13, + "grad_norm": 0.228515625, + "learning_rate": 0.0001995439556395741, + "loss": 0.891, + "step": 2220 + }, + { + "epoch": 0.13, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019953435231839906, + "loss": 0.9991, + "step": 2225 + }, + { + "epoch": 0.13, + "grad_norm": 0.232421875, + "learning_rate": 0.0001995246491712247, + "loss": 0.8978, + "step": 2230 + }, + { + "epoch": 0.13, + "grad_norm": 0.244140625, + "learning_rate": 0.00019951484620778258, + "loss": 0.989, + "step": 2235 + }, + { + "epoch": 0.13, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019950494343790438, + "loss": 0.9474, + "step": 2240 + }, + { + "epoch": 0.13, + "grad_norm": 0.259765625, + "learning_rate": 0.00019949494087152193, + "loss": 0.8953, + "step": 2245 + }, + { + "epoch": 0.13, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019948483851866703, + "loss": 1.0249, + "step": 2250 + }, + { + "epoch": 0.13, + "grad_norm": 0.236328125, + "learning_rate": 0.0001994746363894717, + "loss": 0.9422, + "step": 2255 + }, + { + "epoch": 0.13, + "grad_norm": 0.240234375, + "learning_rate": 0.000199464334494168, + "loss": 0.8769, + "step": 2260 + }, + { + "epoch": 0.13, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019945393284308792, + "loss": 1.0703, + "step": 2265 + }, + { + "epoch": 0.13, + "grad_norm": 0.244140625, + "learning_rate": 0.0001994434314466636, + "loss": 0.921, + "step": 2270 + }, + { + "epoch": 0.13, + "grad_norm": 0.2373046875, + "learning_rate": 0.00019943283031542726, + "loss": 0.9315, + "step": 2275 + }, + { + "epoch": 0.13, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019942212946001103, + "loss": 1.0263, + "step": 2280 + }, + { + "epoch": 0.13, + "grad_norm": 0.236328125, + "learning_rate": 0.00019941132889114718, + "loss": 0.956, + "step": 2285 + }, + { + "epoch": 0.13, + "grad_norm": 0.248046875, + "learning_rate": 0.0001994004286196679, + "loss": 0.9692, + "step": 2290 + }, + { + "epoch": 0.13, + "grad_norm": 0.23046875, + "learning_rate": 0.0001993894286565054, + "loss": 0.882, + "step": 2295 + }, + { + "epoch": 0.13, + "grad_norm": 0.265625, + "learning_rate": 0.00019937832901269187, + "loss": 0.9997, + "step": 2300 + }, + { + "epoch": 0.13, + "grad_norm": 0.255859375, + "learning_rate": 0.00019936712969935948, + "loss": 0.98, + "step": 2305 + }, + { + "epoch": 0.13, + "grad_norm": 0.255859375, + "learning_rate": 0.00019935583072774039, + "loss": 1.0199, + "step": 2310 + }, + { + "epoch": 0.13, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001993444321091666, + "loss": 1.0151, + "step": 2315 + }, + { + "epoch": 0.13, + "grad_norm": 0.234375, + "learning_rate": 0.0001993329338550702, + "loss": 0.9283, + "step": 2320 + }, + { + "epoch": 0.13, + "grad_norm": 0.26171875, + "learning_rate": 0.00019932133597698313, + "loss": 0.9331, + "step": 2325 + }, + { + "epoch": 0.13, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001993096384865372, + "loss": 0.9331, + "step": 2330 + }, + { + "epoch": 0.13, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019929784139546426, + "loss": 0.9636, + "step": 2335 + }, + { + "epoch": 0.13, + "grad_norm": 0.2734375, + "learning_rate": 0.00019928594471559587, + "loss": 1.0361, + "step": 2340 + }, + { + "epoch": 0.13, + "grad_norm": 0.267578125, + "learning_rate": 0.00019927394845886361, + "loss": 1.0051, + "step": 2345 + }, + { + "epoch": 0.13, + "grad_norm": 0.251953125, + "learning_rate": 0.00019926185263729896, + "loss": 0.9548, + "step": 2350 + }, + { + "epoch": 0.14, + "grad_norm": 0.232421875, + "learning_rate": 0.00019924965726303304, + "loss": 0.8994, + "step": 2355 + }, + { + "epoch": 0.14, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001992373623482971, + "loss": 0.9032, + "step": 2360 + }, + { + "epoch": 0.14, + "grad_norm": 0.232421875, + "learning_rate": 0.00019922496790542195, + "loss": 0.9512, + "step": 2365 + }, + { + "epoch": 0.14, + "grad_norm": 0.26953125, + "learning_rate": 0.00019921247394683844, + "loss": 0.9407, + "step": 2370 + }, + { + "epoch": 0.14, + "grad_norm": 0.228515625, + "learning_rate": 0.00019919988048507713, + "loss": 1.025, + "step": 2375 + }, + { + "epoch": 0.14, + "grad_norm": 0.255859375, + "learning_rate": 0.00019918718753276834, + "loss": 0.9967, + "step": 2380 + }, + { + "epoch": 0.14, + "grad_norm": 0.24609375, + "learning_rate": 0.00019917439510264227, + "loss": 0.9103, + "step": 2385 + }, + { + "epoch": 0.14, + "grad_norm": 0.26171875, + "learning_rate": 0.0001991615032075288, + "loss": 0.982, + "step": 2390 + }, + { + "epoch": 0.14, + "grad_norm": 0.25, + "learning_rate": 0.00019914851186035755, + "loss": 0.9849, + "step": 2395 + }, + { + "epoch": 0.14, + "grad_norm": 0.25390625, + "learning_rate": 0.00019913542107415801, + "loss": 0.9227, + "step": 2400 + }, + { + "epoch": 0.14, + "grad_norm": 0.24609375, + "learning_rate": 0.00019912223086205932, + "loss": 0.9488, + "step": 2405 + }, + { + "epoch": 0.14, + "grad_norm": 0.24609375, + "learning_rate": 0.00019910894123729032, + "loss": 0.9723, + "step": 2410 + }, + { + "epoch": 0.14, + "grad_norm": 0.251953125, + "learning_rate": 0.00019909555221317955, + "loss": 1.0172, + "step": 2415 + }, + { + "epoch": 0.14, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019908206380315528, + "loss": 0.9484, + "step": 2420 + }, + { + "epoch": 0.14, + "grad_norm": 0.23046875, + "learning_rate": 0.00019906847602074547, + "loss": 0.8921, + "step": 2425 + }, + { + "epoch": 0.14, + "grad_norm": 0.2431640625, + "learning_rate": 0.00019905478887957773, + "loss": 0.9606, + "step": 2430 + }, + { + "epoch": 0.14, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019904100239337923, + "loss": 0.9709, + "step": 2435 + }, + { + "epoch": 0.14, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019902711657597693, + "loss": 1.0452, + "step": 2440 + }, + { + "epoch": 0.14, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019901313144129727, + "loss": 0.9562, + "step": 2445 + }, + { + "epoch": 0.14, + "grad_norm": 0.251953125, + "learning_rate": 0.00019899904700336642, + "loss": 1.0301, + "step": 2450 + }, + { + "epoch": 0.14, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019898486327631006, + "loss": 1.0033, + "step": 2455 + }, + { + "epoch": 0.14, + "grad_norm": 0.2578125, + "learning_rate": 0.00019897058027435347, + "loss": 1.0009, + "step": 2460 + }, + { + "epoch": 0.14, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019895619801182148, + "loss": 1.0021, + "step": 2465 + }, + { + "epoch": 0.14, + "grad_norm": 0.2578125, + "learning_rate": 0.00019894171650313856, + "loss": 0.9669, + "step": 2470 + }, + { + "epoch": 0.14, + "grad_norm": 0.251953125, + "learning_rate": 0.00019892713576282863, + "loss": 1.0139, + "step": 2475 + }, + { + "epoch": 0.14, + "grad_norm": 0.236328125, + "learning_rate": 0.00019891245580551509, + "loss": 0.9629, + "step": 2480 + }, + { + "epoch": 0.14, + "grad_norm": 0.23828125, + "learning_rate": 0.000198897676645921, + "loss": 0.9778, + "step": 2485 + }, + { + "epoch": 0.14, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019888279829886877, + "loss": 1.0, + "step": 2490 + }, + { + "epoch": 0.14, + "grad_norm": 0.25, + "learning_rate": 0.00019886782077928038, + "loss": 0.9069, + "step": 2495 + }, + { + "epoch": 0.14, + "grad_norm": 0.259765625, + "learning_rate": 0.0001988527441021772, + "loss": 0.9978, + "step": 2500 + }, + { + "epoch": 0.14, + "grad_norm": 0.25390625, + "learning_rate": 0.0001988375682826801, + "loss": 0.9191, + "step": 2505 + }, + { + "epoch": 0.14, + "grad_norm": 0.255859375, + "learning_rate": 0.00019882229333600932, + "loss": 0.9824, + "step": 2510 + }, + { + "epoch": 0.14, + "grad_norm": 0.259765625, + "learning_rate": 0.00019880691927748467, + "loss": 1.0422, + "step": 2515 + }, + { + "epoch": 0.14, + "grad_norm": 0.283203125, + "learning_rate": 0.00019879144612252515, + "loss": 0.938, + "step": 2520 + }, + { + "epoch": 0.14, + "grad_norm": 0.251953125, + "learning_rate": 0.00019877587388664934, + "loss": 0.9222, + "step": 2525 + }, + { + "epoch": 0.15, + "grad_norm": 0.265625, + "learning_rate": 0.000198760202585475, + "loss": 0.9571, + "step": 2530 + }, + { + "epoch": 0.15, + "grad_norm": 0.25390625, + "learning_rate": 0.00019874443223471945, + "loss": 0.968, + "step": 2535 + }, + { + "epoch": 0.15, + "grad_norm": 0.23828125, + "learning_rate": 0.0001987285628501992, + "loss": 0.9952, + "step": 2540 + }, + { + "epoch": 0.15, + "grad_norm": 0.25390625, + "learning_rate": 0.00019871259444783016, + "loss": 0.98, + "step": 2545 + }, + { + "epoch": 0.15, + "grad_norm": 0.24609375, + "learning_rate": 0.0001986965270436275, + "loss": 0.9747, + "step": 2550 + }, + { + "epoch": 0.15, + "grad_norm": 0.244140625, + "learning_rate": 0.00019868036065370567, + "loss": 0.9699, + "step": 2555 + }, + { + "epoch": 0.15, + "grad_norm": 0.248046875, + "learning_rate": 0.00019866409529427852, + "loss": 0.9908, + "step": 2560 + }, + { + "epoch": 0.15, + "grad_norm": 0.263671875, + "learning_rate": 0.00019864773098165898, + "loss": 1.029, + "step": 2565 + }, + { + "epoch": 0.15, + "grad_norm": 0.24609375, + "learning_rate": 0.00019863126773225935, + "loss": 0.9535, + "step": 2570 + }, + { + "epoch": 0.15, + "grad_norm": 0.265625, + "learning_rate": 0.00019861470556259113, + "loss": 0.9764, + "step": 2575 + }, + { + "epoch": 0.15, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019859804448926503, + "loss": 1.0262, + "step": 2580 + }, + { + "epoch": 0.15, + "grad_norm": 0.263671875, + "learning_rate": 0.0001985812845289909, + "loss": 0.9952, + "step": 2585 + }, + { + "epoch": 0.15, + "grad_norm": 0.2578125, + "learning_rate": 0.0001985644256985778, + "loss": 0.9349, + "step": 2590 + }, + { + "epoch": 0.15, + "grad_norm": 0.267578125, + "learning_rate": 0.000198547468014934, + "loss": 0.9153, + "step": 2595 + }, + { + "epoch": 0.15, + "grad_norm": 0.25390625, + "learning_rate": 0.00019853041149506687, + "loss": 0.9486, + "step": 2600 + }, + { + "epoch": 0.15, + "grad_norm": 0.26171875, + "learning_rate": 0.0001985132561560829, + "loss": 1.0409, + "step": 2605 + }, + { + "epoch": 0.15, + "grad_norm": 0.259765625, + "learning_rate": 0.00019849600201518768, + "loss": 1.0015, + "step": 2610 + }, + { + "epoch": 0.15, + "grad_norm": 0.259765625, + "learning_rate": 0.00019847864908968592, + "loss": 1.043, + "step": 2615 + }, + { + "epoch": 0.15, + "grad_norm": 0.255859375, + "learning_rate": 0.0001984611973969814, + "loss": 0.9974, + "step": 2620 + }, + { + "epoch": 0.15, + "grad_norm": 0.24609375, + "learning_rate": 0.0001984436469545769, + "loss": 0.9339, + "step": 2625 + }, + { + "epoch": 0.15, + "grad_norm": 0.23828125, + "learning_rate": 0.00019842599778007437, + "loss": 0.9445, + "step": 2630 + }, + { + "epoch": 0.15, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019840824989117464, + "loss": 0.9528, + "step": 2635 + }, + { + "epoch": 0.15, + "grad_norm": 0.275390625, + "learning_rate": 0.0001983904033056776, + "loss": 1.0025, + "step": 2640 + }, + { + "epoch": 0.15, + "grad_norm": 0.2578125, + "learning_rate": 0.00019837245804148218, + "loss": 0.9132, + "step": 2645 + }, + { + "epoch": 0.15, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019835441411658616, + "loss": 1.0168, + "step": 2650 + }, + { + "epoch": 0.15, + "grad_norm": 0.287109375, + "learning_rate": 0.00019833627154908635, + "loss": 1.026, + "step": 2655 + }, + { + "epoch": 0.15, + "grad_norm": 0.298828125, + "learning_rate": 0.00019831803035717853, + "loss": 0.986, + "step": 2660 + }, + { + "epoch": 0.15, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019829969055915728, + "loss": 0.9764, + "step": 2665 + }, + { + "epoch": 0.15, + "grad_norm": 0.26171875, + "learning_rate": 0.00019828125217341618, + "loss": 0.9559, + "step": 2670 + }, + { + "epoch": 0.15, + "grad_norm": 0.26953125, + "learning_rate": 0.0001982627152184476, + "loss": 0.8965, + "step": 2675 + }, + { + "epoch": 0.15, + "grad_norm": 0.244140625, + "learning_rate": 0.00019824407971284286, + "loss": 0.9161, + "step": 2680 + }, + { + "epoch": 0.15, + "grad_norm": 0.23828125, + "learning_rate": 0.000198225345675292, + "loss": 0.9899, + "step": 2685 + }, + { + "epoch": 0.15, + "grad_norm": 0.2578125, + "learning_rate": 0.00019820651312458403, + "loss": 0.9215, + "step": 2690 + }, + { + "epoch": 0.15, + "grad_norm": 0.2578125, + "learning_rate": 0.00019818758207960663, + "loss": 0.9574, + "step": 2695 + }, + { + "epoch": 0.15, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019816855255934635, + "loss": 1.0076, + "step": 2700 + }, + { + "epoch": 0.16, + "grad_norm": 0.2578125, + "learning_rate": 0.0001981494245828884, + "loss": 1.008, + "step": 2705 + }, + { + "epoch": 0.16, + "grad_norm": 0.255859375, + "learning_rate": 0.00019813019816941689, + "loss": 1.0307, + "step": 2710 + }, + { + "epoch": 0.16, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019811087333821453, + "loss": 0.9577, + "step": 2715 + }, + { + "epoch": 0.16, + "grad_norm": 0.255859375, + "learning_rate": 0.00019809145010866276, + "loss": 0.9487, + "step": 2720 + }, + { + "epoch": 0.16, + "grad_norm": 0.265625, + "learning_rate": 0.00019807192850024175, + "loss": 0.9503, + "step": 2725 + }, + { + "epoch": 0.16, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019805230853253033, + "loss": 0.9102, + "step": 2730 + }, + { + "epoch": 0.16, + "grad_norm": 0.255859375, + "learning_rate": 0.00019803259022520587, + "loss": 0.9899, + "step": 2735 + }, + { + "epoch": 0.16, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001980127735980445, + "loss": 0.9321, + "step": 2740 + }, + { + "epoch": 0.16, + "grad_norm": 0.255859375, + "learning_rate": 0.00019799285867092096, + "loss": 0.9606, + "step": 2745 + }, + { + "epoch": 0.16, + "grad_norm": 0.30859375, + "learning_rate": 0.00019797284546380848, + "loss": 0.9441, + "step": 2750 + }, + { + "epoch": 0.16, + "grad_norm": 0.25390625, + "learning_rate": 0.00019795273399677893, + "loss": 1.0307, + "step": 2755 + }, + { + "epoch": 0.16, + "grad_norm": 0.240234375, + "learning_rate": 0.00019793252429000266, + "loss": 0.9548, + "step": 2760 + }, + { + "epoch": 0.16, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019791221636374865, + "loss": 0.9304, + "step": 2765 + }, + { + "epoch": 0.16, + "grad_norm": 0.240234375, + "learning_rate": 0.0001978918102383843, + "loss": 0.952, + "step": 2770 + }, + { + "epoch": 0.16, + "grad_norm": 0.271484375, + "learning_rate": 0.00019787130593437553, + "loss": 1.0108, + "step": 2775 + }, + { + "epoch": 0.16, + "grad_norm": 0.25390625, + "learning_rate": 0.00019785070347228673, + "loss": 0.951, + "step": 2780 + }, + { + "epoch": 0.16, + "grad_norm": 0.251953125, + "learning_rate": 0.00019783000287278078, + "loss": 1.0625, + "step": 2785 + }, + { + "epoch": 0.16, + "grad_norm": 0.25, + "learning_rate": 0.00019780920415661882, + "loss": 0.9996, + "step": 2790 + }, + { + "epoch": 0.16, + "grad_norm": 0.259765625, + "learning_rate": 0.0001977883073446606, + "loss": 0.9658, + "step": 2795 + }, + { + "epoch": 0.16, + "grad_norm": 0.2421875, + "learning_rate": 0.00019776731245786414, + "loss": 1.027, + "step": 2800 + }, + { + "epoch": 0.16, + "grad_norm": 0.236328125, + "learning_rate": 0.00019774621951728583, + "loss": 0.9845, + "step": 2805 + }, + { + "epoch": 0.16, + "grad_norm": 0.24609375, + "learning_rate": 0.00019772502854408042, + "loss": 0.9474, + "step": 2810 + }, + { + "epoch": 0.16, + "grad_norm": 0.251953125, + "learning_rate": 0.000197703739559501, + "loss": 0.9708, + "step": 2815 + }, + { + "epoch": 0.16, + "grad_norm": 0.25, + "learning_rate": 0.00019768235258489888, + "loss": 0.9832, + "step": 2820 + }, + { + "epoch": 0.16, + "grad_norm": 0.259765625, + "learning_rate": 0.00019766086764172377, + "loss": 0.9671, + "step": 2825 + }, + { + "epoch": 0.16, + "grad_norm": 0.248046875, + "learning_rate": 0.00019763928475152352, + "loss": 1.004, + "step": 2830 + }, + { + "epoch": 0.16, + "grad_norm": 0.23828125, + "learning_rate": 0.00019761760393594425, + "loss": 0.9782, + "step": 2835 + }, + { + "epoch": 0.16, + "grad_norm": 0.265625, + "learning_rate": 0.00019759582521673035, + "loss": 0.951, + "step": 2840 + }, + { + "epoch": 0.16, + "grad_norm": 0.2578125, + "learning_rate": 0.00019757394861572432, + "loss": 0.9236, + "step": 2845 + }, + { + "epoch": 0.16, + "grad_norm": 0.251953125, + "learning_rate": 0.00019755197415486685, + "loss": 1.0101, + "step": 2850 + }, + { + "epoch": 0.16, + "grad_norm": 0.240234375, + "learning_rate": 0.00019752990185619682, + "loss": 0.9481, + "step": 2855 + }, + { + "epoch": 0.16, + "grad_norm": 0.244140625, + "learning_rate": 0.00019750773174185123, + "loss": 0.9601, + "step": 2860 + }, + { + "epoch": 0.16, + "grad_norm": 0.255859375, + "learning_rate": 0.00019748546383406508, + "loss": 0.9757, + "step": 2865 + }, + { + "epoch": 0.16, + "grad_norm": 0.26171875, + "learning_rate": 0.00019746309815517153, + "loss": 0.9169, + "step": 2870 + }, + { + "epoch": 0.16, + "grad_norm": 0.23828125, + "learning_rate": 0.0001974406347276019, + "loss": 0.9888, + "step": 2875 + }, + { + "epoch": 0.17, + "grad_norm": 0.25, + "learning_rate": 0.0001974180735738853, + "loss": 0.9517, + "step": 2880 + }, + { + "epoch": 0.17, + "grad_norm": 0.25, + "learning_rate": 0.00019739541471664907, + "loss": 0.9753, + "step": 2885 + }, + { + "epoch": 0.17, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019737265817861845, + "loss": 0.9833, + "step": 2890 + }, + { + "epoch": 0.17, + "grad_norm": 0.265625, + "learning_rate": 0.00019734980398261666, + "loss": 1.0542, + "step": 2895 + }, + { + "epoch": 0.17, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019732685215156483, + "loss": 0.9272, + "step": 2900 + }, + { + "epoch": 0.17, + "grad_norm": 0.294921875, + "learning_rate": 0.00019730380270848209, + "loss": 0.8694, + "step": 2905 + }, + { + "epoch": 0.17, + "grad_norm": 0.287109375, + "learning_rate": 0.00019728065567648536, + "loss": 0.9686, + "step": 2910 + }, + { + "epoch": 0.17, + "grad_norm": 0.2431640625, + "learning_rate": 0.00019725741107878958, + "loss": 0.9636, + "step": 2915 + }, + { + "epoch": 0.17, + "grad_norm": 0.265625, + "learning_rate": 0.00019723406893870738, + "loss": 0.9482, + "step": 2920 + }, + { + "epoch": 0.17, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001972106292796493, + "loss": 0.949, + "step": 2925 + }, + { + "epoch": 0.17, + "grad_norm": 0.25, + "learning_rate": 0.00019718709212512373, + "loss": 0.9562, + "step": 2930 + }, + { + "epoch": 0.17, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019716345749873674, + "loss": 0.9978, + "step": 2935 + }, + { + "epoch": 0.17, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019713972542419227, + "loss": 0.9335, + "step": 2940 + }, + { + "epoch": 0.17, + "grad_norm": 0.279296875, + "learning_rate": 0.00019711589592529187, + "loss": 1.0099, + "step": 2945 + }, + { + "epoch": 0.17, + "grad_norm": 0.384765625, + "learning_rate": 0.0001970919690259349, + "loss": 0.9736, + "step": 2950 + }, + { + "epoch": 0.17, + "grad_norm": 0.263671875, + "learning_rate": 0.00019706794475011835, + "loss": 1.0408, + "step": 2955 + }, + { + "epoch": 0.17, + "grad_norm": 0.267578125, + "learning_rate": 0.00019704382312193687, + "loss": 0.983, + "step": 2960 + }, + { + "epoch": 0.17, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019701960416558282, + "loss": 0.9751, + "step": 2965 + }, + { + "epoch": 0.17, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001969952879053461, + "loss": 0.9368, + "step": 2970 + }, + { + "epoch": 0.17, + "grad_norm": 0.267578125, + "learning_rate": 0.00019697087436561418, + "loss": 0.9285, + "step": 2975 + }, + { + "epoch": 0.17, + "grad_norm": 0.263671875, + "learning_rate": 0.0001969463635708722, + "loss": 0.9645, + "step": 2980 + }, + { + "epoch": 0.17, + "grad_norm": 0.25, + "learning_rate": 0.0001969217555457027, + "loss": 0.9815, + "step": 2985 + }, + { + "epoch": 0.17, + "grad_norm": 0.267578125, + "learning_rate": 0.00019689705031478586, + "loss": 1.0308, + "step": 2990 + }, + { + "epoch": 0.17, + "grad_norm": 0.271484375, + "learning_rate": 0.00019687224790289933, + "loss": 0.9747, + "step": 2995 + }, + { + "epoch": 0.17, + "grad_norm": 0.2578125, + "learning_rate": 0.00019684734833491811, + "loss": 0.9034, + "step": 3000 + }, + { + "epoch": 0.17, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001968223516358148, + "loss": 0.963, + "step": 3005 + }, + { + "epoch": 0.17, + "grad_norm": 0.240234375, + "learning_rate": 0.0001967972578306593, + "loss": 1.0035, + "step": 3010 + }, + { + "epoch": 0.17, + "grad_norm": 0.240234375, + "learning_rate": 0.00019677206694461896, + "loss": 0.9707, + "step": 3015 + }, + { + "epoch": 0.17, + "grad_norm": 0.26953125, + "learning_rate": 0.0001967467790029585, + "loss": 1.0332, + "step": 3020 + }, + { + "epoch": 0.17, + "grad_norm": 0.25390625, + "learning_rate": 0.0001967213940310399, + "loss": 1.0415, + "step": 3025 + }, + { + "epoch": 0.17, + "grad_norm": 0.259765625, + "learning_rate": 0.00019669591205432254, + "loss": 0.9611, + "step": 3030 + }, + { + "epoch": 0.17, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001966703330983631, + "loss": 0.9171, + "step": 3035 + }, + { + "epoch": 0.17, + "grad_norm": 0.283203125, + "learning_rate": 0.00019664465718881543, + "loss": 0.9974, + "step": 3040 + }, + { + "epoch": 0.17, + "grad_norm": 0.248046875, + "learning_rate": 0.00019661888435143073, + "loss": 0.9486, + "step": 3045 + }, + { + "epoch": 0.18, + "grad_norm": 0.244140625, + "learning_rate": 0.00019659301461205728, + "loss": 0.9646, + "step": 3050 + }, + { + "epoch": 0.18, + "grad_norm": 0.259765625, + "learning_rate": 0.0001965670479966407, + "loss": 0.964, + "step": 3055 + }, + { + "epoch": 0.18, + "grad_norm": 0.2734375, + "learning_rate": 0.0001965409845312236, + "loss": 0.9701, + "step": 3060 + }, + { + "epoch": 0.18, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001965148242419459, + "loss": 0.9235, + "step": 3065 + }, + { + "epoch": 0.18, + "grad_norm": 0.251953125, + "learning_rate": 0.0001964885671550445, + "loss": 0.9457, + "step": 3070 + }, + { + "epoch": 0.18, + "grad_norm": 0.25390625, + "learning_rate": 0.00019646221329685344, + "loss": 0.9532, + "step": 3075 + }, + { + "epoch": 0.18, + "grad_norm": 0.279296875, + "learning_rate": 0.0001964357626938038, + "loss": 0.9198, + "step": 3080 + }, + { + "epoch": 0.18, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019640921537242365, + "loss": 0.984, + "step": 3085 + }, + { + "epoch": 0.18, + "grad_norm": 0.25390625, + "learning_rate": 0.00019638257135933814, + "loss": 0.932, + "step": 3090 + }, + { + "epoch": 0.18, + "grad_norm": 0.25, + "learning_rate": 0.00019635583068126935, + "loss": 1.0043, + "step": 3095 + }, + { + "epoch": 0.18, + "grad_norm": 0.30078125, + "learning_rate": 0.0001963289933650363, + "loss": 0.9815, + "step": 3100 + }, + { + "epoch": 0.18, + "grad_norm": 0.2734375, + "learning_rate": 0.0001963020594375549, + "loss": 0.9163, + "step": 3105 + }, + { + "epoch": 0.18, + "grad_norm": 0.279296875, + "learning_rate": 0.00019627502892583806, + "loss": 0.9536, + "step": 3110 + }, + { + "epoch": 0.18, + "grad_norm": 0.263671875, + "learning_rate": 0.00019624790185699548, + "loss": 0.954, + "step": 3115 + }, + { + "epoch": 0.18, + "grad_norm": 0.275390625, + "learning_rate": 0.0001962206782582337, + "loss": 0.9576, + "step": 3120 + }, + { + "epoch": 0.18, + "grad_norm": 0.244140625, + "learning_rate": 0.0001961933581568561, + "loss": 0.9739, + "step": 3125 + }, + { + "epoch": 0.18, + "grad_norm": 0.255859375, + "learning_rate": 0.00019616594158026283, + "loss": 0.8902, + "step": 3130 + }, + { + "epoch": 0.18, + "grad_norm": 0.244140625, + "learning_rate": 0.0001961384285559508, + "loss": 0.9157, + "step": 3135 + }, + { + "epoch": 0.18, + "grad_norm": 0.263671875, + "learning_rate": 0.0001961108191115136, + "loss": 0.9462, + "step": 3140 + }, + { + "epoch": 0.18, + "grad_norm": 0.2734375, + "learning_rate": 0.00019608311327464167, + "loss": 0.9893, + "step": 3145 + }, + { + "epoch": 0.18, + "grad_norm": 0.4453125, + "learning_rate": 0.00019605531107312195, + "loss": 0.9836, + "step": 3150 + }, + { + "epoch": 0.18, + "grad_norm": 0.26953125, + "learning_rate": 0.00019602741253483817, + "loss": 0.8958, + "step": 3155 + }, + { + "epoch": 0.18, + "grad_norm": 0.2578125, + "learning_rate": 0.00019599941768777055, + "loss": 0.9296, + "step": 3160 + }, + { + "epoch": 0.18, + "grad_norm": 0.263671875, + "learning_rate": 0.000195971326559996, + "loss": 0.9593, + "step": 3165 + }, + { + "epoch": 0.18, + "grad_norm": 0.265625, + "learning_rate": 0.00019594313917968795, + "loss": 0.8882, + "step": 3170 + }, + { + "epoch": 0.18, + "grad_norm": 0.265625, + "learning_rate": 0.00019591485557511636, + "loss": 1.0049, + "step": 3175 + }, + { + "epoch": 0.18, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019588647577464775, + "loss": 0.9233, + "step": 3180 + }, + { + "epoch": 0.18, + "grad_norm": 0.2578125, + "learning_rate": 0.000195857999806745, + "loss": 0.9671, + "step": 3185 + }, + { + "epoch": 0.18, + "grad_norm": 0.267578125, + "learning_rate": 0.0001958294276999676, + "loss": 0.9873, + "step": 3190 + }, + { + "epoch": 0.18, + "grad_norm": 0.26171875, + "learning_rate": 0.00019580075948297135, + "loss": 0.9511, + "step": 3195 + }, + { + "epoch": 0.18, + "grad_norm": 0.353515625, + "learning_rate": 0.00019577199518450847, + "loss": 1.0246, + "step": 3200 + }, + { + "epoch": 0.18, + "grad_norm": 0.259765625, + "learning_rate": 0.00019574313483342748, + "loss": 0.9028, + "step": 3205 + }, + { + "epoch": 0.18, + "grad_norm": 0.267578125, + "learning_rate": 0.00019571417845867337, + "loss": 0.9715, + "step": 3210 + }, + { + "epoch": 0.18, + "grad_norm": 0.24609375, + "learning_rate": 0.00019568512608928736, + "loss": 1.0416, + "step": 3215 + }, + { + "epoch": 0.18, + "grad_norm": 0.267578125, + "learning_rate": 0.00019565597775440688, + "loss": 0.9841, + "step": 3220 + }, + { + "epoch": 0.19, + "grad_norm": 0.279296875, + "learning_rate": 0.00019562673348326573, + "loss": 0.9476, + "step": 3225 + }, + { + "epoch": 0.19, + "grad_norm": 0.255859375, + "learning_rate": 0.00019559739330519388, + "loss": 0.9602, + "step": 3230 + }, + { + "epoch": 0.19, + "grad_norm": 0.2734375, + "learning_rate": 0.00019556795724961742, + "loss": 0.9761, + "step": 3235 + }, + { + "epoch": 0.19, + "grad_norm": 0.259765625, + "learning_rate": 0.00019553842534605868, + "loss": 0.9506, + "step": 3240 + }, + { + "epoch": 0.19, + "grad_norm": 0.23046875, + "learning_rate": 0.00019550879762413615, + "loss": 0.9009, + "step": 3245 + }, + { + "epoch": 0.19, + "grad_norm": 0.267578125, + "learning_rate": 0.00019547907411356427, + "loss": 0.9218, + "step": 3250 + }, + { + "epoch": 0.19, + "grad_norm": 0.25, + "learning_rate": 0.00019544925484415372, + "loss": 0.9365, + "step": 3255 + }, + { + "epoch": 0.19, + "grad_norm": 0.259765625, + "learning_rate": 0.0001954193398458111, + "loss": 1.0058, + "step": 3260 + }, + { + "epoch": 0.19, + "grad_norm": 0.25390625, + "learning_rate": 0.0001953893291485391, + "loss": 0.9287, + "step": 3265 + }, + { + "epoch": 0.19, + "grad_norm": 0.259765625, + "learning_rate": 0.00019535922278243634, + "loss": 0.989, + "step": 3270 + }, + { + "epoch": 0.19, + "grad_norm": 0.265625, + "learning_rate": 0.00019532902077769735, + "loss": 0.9226, + "step": 3275 + }, + { + "epoch": 0.19, + "grad_norm": 0.283203125, + "learning_rate": 0.00019529872316461272, + "loss": 0.9946, + "step": 3280 + }, + { + "epoch": 0.19, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019526832997356875, + "loss": 1.0205, + "step": 3285 + }, + { + "epoch": 0.19, + "grad_norm": 0.244140625, + "learning_rate": 0.00019523784123504775, + "loss": 0.9722, + "step": 3290 + }, + { + "epoch": 0.19, + "grad_norm": 0.265625, + "learning_rate": 0.00019520725697962777, + "loss": 0.9066, + "step": 3295 + }, + { + "epoch": 0.19, + "grad_norm": 0.25390625, + "learning_rate": 0.00019517657723798268, + "loss": 0.9514, + "step": 3300 + }, + { + "epoch": 0.19, + "grad_norm": 0.251953125, + "learning_rate": 0.00019514580204088212, + "loss": 0.9312, + "step": 3305 + }, + { + "epoch": 0.19, + "grad_norm": 0.306640625, + "learning_rate": 0.00019511493141919145, + "loss": 0.9376, + "step": 3310 + }, + { + "epoch": 0.19, + "grad_norm": 0.28515625, + "learning_rate": 0.00019508396540387178, + "loss": 0.9941, + "step": 3315 + }, + { + "epoch": 0.19, + "grad_norm": 0.265625, + "learning_rate": 0.0001950529040259798, + "loss": 0.964, + "step": 3320 + }, + { + "epoch": 0.19, + "grad_norm": 0.251953125, + "learning_rate": 0.00019502174731666797, + "loss": 0.9354, + "step": 3325 + }, + { + "epoch": 0.19, + "grad_norm": 0.25390625, + "learning_rate": 0.00019499049530718424, + "loss": 0.8546, + "step": 3330 + }, + { + "epoch": 0.19, + "grad_norm": 0.26953125, + "learning_rate": 0.00019495914802887226, + "loss": 0.9735, + "step": 3335 + }, + { + "epoch": 0.19, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019492770551317106, + "loss": 0.9428, + "step": 3340 + }, + { + "epoch": 0.19, + "grad_norm": 0.236328125, + "learning_rate": 0.00019489616779161533, + "loss": 1.007, + "step": 3345 + }, + { + "epoch": 0.19, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019486453489583525, + "loss": 0.9552, + "step": 3350 + }, + { + "epoch": 0.19, + "grad_norm": 0.2578125, + "learning_rate": 0.0001948328068575563, + "loss": 0.9544, + "step": 3355 + }, + { + "epoch": 0.19, + "grad_norm": 0.255859375, + "learning_rate": 0.0001948009837085996, + "loss": 0.9965, + "step": 3360 + }, + { + "epoch": 0.19, + "grad_norm": 0.255859375, + "learning_rate": 0.00019476906548088148, + "loss": 0.9454, + "step": 3365 + }, + { + "epoch": 0.19, + "grad_norm": 0.283203125, + "learning_rate": 0.00019473705220641367, + "loss": 1.0041, + "step": 3370 + }, + { + "epoch": 0.19, + "grad_norm": 0.251953125, + "learning_rate": 0.0001947049439173033, + "loss": 0.9858, + "step": 3375 + }, + { + "epoch": 0.19, + "grad_norm": 0.267578125, + "learning_rate": 0.00019467274064575275, + "loss": 0.9584, + "step": 3380 + }, + { + "epoch": 0.19, + "grad_norm": 0.255859375, + "learning_rate": 0.0001946404424240596, + "loss": 0.9498, + "step": 3385 + }, + { + "epoch": 0.19, + "grad_norm": 0.271484375, + "learning_rate": 0.0001946080492846167, + "loss": 0.9957, + "step": 3390 + }, + { + "epoch": 0.19, + "grad_norm": 0.263671875, + "learning_rate": 0.00019457556125991216, + "loss": 0.9955, + "step": 3395 + }, + { + "epoch": 0.2, + "grad_norm": 0.2431640625, + "learning_rate": 0.00019454297838252918, + "loss": 0.9109, + "step": 3400 + }, + { + "epoch": 0.2, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019451030068514609, + "loss": 0.9387, + "step": 3405 + }, + { + "epoch": 0.2, + "grad_norm": 0.25390625, + "learning_rate": 0.00019447752820053634, + "loss": 0.9458, + "step": 3410 + }, + { + "epoch": 0.2, + "grad_norm": 0.251953125, + "learning_rate": 0.00019444466096156846, + "loss": 0.9691, + "step": 3415 + }, + { + "epoch": 0.2, + "grad_norm": 0.28125, + "learning_rate": 0.00019441169900120598, + "loss": 0.9299, + "step": 3420 + }, + { + "epoch": 0.2, + "grad_norm": 0.275390625, + "learning_rate": 0.00019437864235250744, + "loss": 0.9265, + "step": 3425 + }, + { + "epoch": 0.2, + "grad_norm": 0.244140625, + "learning_rate": 0.00019434549104862639, + "loss": 1.0198, + "step": 3430 + }, + { + "epoch": 0.2, + "grad_norm": 0.265625, + "learning_rate": 0.0001943122451228112, + "loss": 0.9645, + "step": 3435 + }, + { + "epoch": 0.2, + "grad_norm": 0.23828125, + "learning_rate": 0.00019427890460840526, + "loss": 1.0366, + "step": 3440 + }, + { + "epoch": 0.2, + "grad_norm": 0.2578125, + "learning_rate": 0.00019424546953884675, + "loss": 0.9585, + "step": 3445 + }, + { + "epoch": 0.2, + "grad_norm": 0.255859375, + "learning_rate": 0.00019421193994766873, + "loss": 0.9712, + "step": 3450 + }, + { + "epoch": 0.2, + "grad_norm": 0.248046875, + "learning_rate": 0.000194178315868499, + "loss": 1.0027, + "step": 3455 + }, + { + "epoch": 0.2, + "grad_norm": 0.265625, + "learning_rate": 0.00019414459733506023, + "loss": 0.9572, + "step": 3460 + }, + { + "epoch": 0.2, + "grad_norm": 0.251953125, + "learning_rate": 0.00019411078438116969, + "loss": 0.9517, + "step": 3465 + }, + { + "epoch": 0.2, + "grad_norm": 0.259765625, + "learning_rate": 0.00019407687704073943, + "loss": 0.9082, + "step": 3470 + }, + { + "epoch": 0.2, + "grad_norm": 0.263671875, + "learning_rate": 0.00019404287534777615, + "loss": 0.9686, + "step": 3475 + }, + { + "epoch": 0.2, + "grad_norm": 0.25390625, + "learning_rate": 0.00019400877933638114, + "loss": 0.9654, + "step": 3480 + }, + { + "epoch": 0.2, + "grad_norm": 0.255859375, + "learning_rate": 0.00019397458904075036, + "loss": 1.0102, + "step": 3485 + }, + { + "epoch": 0.2, + "grad_norm": 0.26171875, + "learning_rate": 0.00019394030449517428, + "loss": 0.8971, + "step": 3490 + }, + { + "epoch": 0.2, + "grad_norm": 0.263671875, + "learning_rate": 0.00019390592573403787, + "loss": 1.029, + "step": 3495 + }, + { + "epoch": 0.2, + "grad_norm": 0.275390625, + "learning_rate": 0.0001938714527918207, + "loss": 0.9432, + "step": 3500 + }, + { + "epoch": 0.2, + "grad_norm": 0.25, + "learning_rate": 0.0001938368857030966, + "loss": 1.0492, + "step": 3505 + }, + { + "epoch": 0.2, + "grad_norm": 0.25390625, + "learning_rate": 0.00019380222450253405, + "loss": 1.0287, + "step": 3510 + }, + { + "epoch": 0.2, + "grad_norm": 0.259765625, + "learning_rate": 0.00019376746922489577, + "loss": 0.9945, + "step": 3515 + }, + { + "epoch": 0.2, + "grad_norm": 0.271484375, + "learning_rate": 0.00019373261990503888, + "loss": 0.9362, + "step": 3520 + }, + { + "epoch": 0.2, + "grad_norm": 0.27734375, + "learning_rate": 0.00019369767657791479, + "loss": 0.9723, + "step": 3525 + }, + { + "epoch": 0.2, + "grad_norm": 0.251953125, + "learning_rate": 0.00019366263927856928, + "loss": 0.986, + "step": 3530 + }, + { + "epoch": 0.2, + "grad_norm": 0.2431640625, + "learning_rate": 0.00019362750804214222, + "loss": 0.9137, + "step": 3535 + }, + { + "epoch": 0.2, + "grad_norm": 0.259765625, + "learning_rate": 0.0001935922829038679, + "loss": 0.8947, + "step": 3540 + }, + { + "epoch": 0.2, + "grad_norm": 0.251953125, + "learning_rate": 0.00019355696389907455, + "loss": 0.9909, + "step": 3545 + }, + { + "epoch": 0.2, + "grad_norm": 0.275390625, + "learning_rate": 0.00019352155106318471, + "loss": 0.9157, + "step": 3550 + }, + { + "epoch": 0.2, + "grad_norm": 0.25390625, + "learning_rate": 0.00019348604443171502, + "loss": 0.9311, + "step": 3555 + }, + { + "epoch": 0.2, + "grad_norm": 0.26171875, + "learning_rate": 0.00019345044404027613, + "loss": 1.043, + "step": 3560 + }, + { + "epoch": 0.2, + "grad_norm": 0.26171875, + "learning_rate": 0.0001934147499245727, + "loss": 0.9324, + "step": 3565 + }, + { + "epoch": 0.2, + "grad_norm": 0.2734375, + "learning_rate": 0.0001933789621204035, + "loss": 0.9573, + "step": 3570 + }, + { + "epoch": 0.21, + "grad_norm": 0.251953125, + "learning_rate": 0.00019334308066366114, + "loss": 0.9804, + "step": 3575 + }, + { + "epoch": 0.21, + "grad_norm": 0.27734375, + "learning_rate": 0.00019330710559033225, + "loss": 0.948, + "step": 3580 + }, + { + "epoch": 0.21, + "grad_norm": 0.2578125, + "learning_rate": 0.0001932710369364973, + "loss": 0.9988, + "step": 3585 + }, + { + "epoch": 0.21, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019323487473833062, + "loss": 0.9151, + "step": 3590 + }, + { + "epoch": 0.21, + "grad_norm": 0.26171875, + "learning_rate": 0.0001931986190321004, + "loss": 0.9649, + "step": 3595 + }, + { + "epoch": 0.21, + "grad_norm": 0.25390625, + "learning_rate": 0.00019316226985416853, + "loss": 0.9589, + "step": 3600 + }, + { + "epoch": 0.21, + "grad_norm": 0.283203125, + "learning_rate": 0.00019312582724099076, + "loss": 0.9673, + "step": 3605 + }, + { + "epoch": 0.21, + "grad_norm": 0.24609375, + "learning_rate": 0.00019308929122911642, + "loss": 0.8674, + "step": 3610 + }, + { + "epoch": 0.21, + "grad_norm": 0.26953125, + "learning_rate": 0.0001930526618551886, + "loss": 1.0047, + "step": 3615 + }, + { + "epoch": 0.21, + "grad_norm": 0.251953125, + "learning_rate": 0.00019301593915594403, + "loss": 0.9402, + "step": 3620 + }, + { + "epoch": 0.21, + "grad_norm": 0.259765625, + "learning_rate": 0.00019297912316821298, + "loss": 0.9739, + "step": 3625 + }, + { + "epoch": 0.21, + "grad_norm": 0.259765625, + "learning_rate": 0.00019294221392891932, + "loss": 0.9754, + "step": 3630 + }, + { + "epoch": 0.21, + "grad_norm": 0.265625, + "learning_rate": 0.00019290521147508042, + "loss": 0.9692, + "step": 3635 + }, + { + "epoch": 0.21, + "grad_norm": 0.263671875, + "learning_rate": 0.0001928681158438072, + "loss": 1.0017, + "step": 3640 + }, + { + "epoch": 0.21, + "grad_norm": 0.25390625, + "learning_rate": 0.00019283092707230392, + "loss": 1.0303, + "step": 3645 + }, + { + "epoch": 0.21, + "grad_norm": 0.271484375, + "learning_rate": 0.0001927936451978684, + "loss": 1.0064, + "step": 3650 + }, + { + "epoch": 0.21, + "grad_norm": 0.2578125, + "learning_rate": 0.0001927562702578917, + "loss": 0.9485, + "step": 3655 + }, + { + "epoch": 0.21, + "grad_norm": 0.263671875, + "learning_rate": 0.00019271880228985828, + "loss": 0.9458, + "step": 3660 + }, + { + "epoch": 0.21, + "grad_norm": 0.26171875, + "learning_rate": 0.00019268124133134588, + "loss": 0.9705, + "step": 3665 + }, + { + "epoch": 0.21, + "grad_norm": 0.2578125, + "learning_rate": 0.00019264358742002556, + "loss": 0.9049, + "step": 3670 + }, + { + "epoch": 0.21, + "grad_norm": 0.26171875, + "learning_rate": 0.00019260584059366153, + "loss": 1.0574, + "step": 3675 + }, + { + "epoch": 0.21, + "grad_norm": 0.267578125, + "learning_rate": 0.00019256800089011123, + "loss": 1.0533, + "step": 3680 + }, + { + "epoch": 0.21, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001925300683473252, + "loss": 0.9216, + "step": 3685 + }, + { + "epoch": 0.21, + "grad_norm": 0.275390625, + "learning_rate": 0.0001924920430033472, + "loss": 0.9334, + "step": 3690 + }, + { + "epoch": 0.21, + "grad_norm": 0.251953125, + "learning_rate": 0.00019245392489631392, + "loss": 0.9523, + "step": 3695 + }, + { + "epoch": 0.21, + "grad_norm": 0.26171875, + "learning_rate": 0.00019241571406445525, + "loss": 0.9449, + "step": 3700 + }, + { + "epoch": 0.21, + "grad_norm": 0.26171875, + "learning_rate": 0.00019237741054609387, + "loss": 0.9897, + "step": 3705 + }, + { + "epoch": 0.21, + "grad_norm": 0.259765625, + "learning_rate": 0.00019233901437964562, + "loss": 0.9448, + "step": 3710 + }, + { + "epoch": 0.21, + "grad_norm": 0.244140625, + "learning_rate": 0.00019230052560361913, + "loss": 0.9926, + "step": 3715 + }, + { + "epoch": 0.21, + "grad_norm": 0.28125, + "learning_rate": 0.00019226194425661598, + "loss": 0.9142, + "step": 3720 + }, + { + "epoch": 0.21, + "grad_norm": 0.279296875, + "learning_rate": 0.00019222327037733052, + "loss": 1.0046, + "step": 3725 + }, + { + "epoch": 0.21, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019218450400454998, + "loss": 0.9166, + "step": 3730 + }, + { + "epoch": 0.21, + "grad_norm": 0.283203125, + "learning_rate": 0.00019214564517715433, + "loss": 0.9597, + "step": 3735 + }, + { + "epoch": 0.21, + "grad_norm": 0.265625, + "learning_rate": 0.00019210669393411624, + "loss": 0.9981, + "step": 3740 + }, + { + "epoch": 0.21, + "grad_norm": 0.263671875, + "learning_rate": 0.00019206765031450112, + "loss": 0.9568, + "step": 3745 + }, + { + "epoch": 0.22, + "grad_norm": 0.3125, + "learning_rate": 0.00019202851435746695, + "loss": 0.9513, + "step": 3750 + }, + { + "epoch": 0.22, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019198928610226435, + "loss": 0.943, + "step": 3755 + }, + { + "epoch": 0.22, + "grad_norm": 0.2734375, + "learning_rate": 0.00019194996558823655, + "loss": 1.0194, + "step": 3760 + }, + { + "epoch": 0.22, + "grad_norm": 0.265625, + "learning_rate": 0.00019191055285481927, + "loss": 1.0096, + "step": 3765 + }, + { + "epoch": 0.22, + "grad_norm": 0.26171875, + "learning_rate": 0.00019187104794154074, + "loss": 0.9833, + "step": 3770 + }, + { + "epoch": 0.22, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019183145088802158, + "loss": 0.9513, + "step": 3775 + }, + { + "epoch": 0.22, + "grad_norm": 0.259765625, + "learning_rate": 0.00019179176173397494, + "loss": 0.9946, + "step": 3780 + }, + { + "epoch": 0.22, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001917519805192062, + "loss": 0.9526, + "step": 3785 + }, + { + "epoch": 0.22, + "grad_norm": 0.263671875, + "learning_rate": 0.00019171210728361317, + "loss": 1.0067, + "step": 3790 + }, + { + "epoch": 0.22, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019167214206718594, + "loss": 0.9788, + "step": 3795 + }, + { + "epoch": 0.22, + "grad_norm": 0.2734375, + "learning_rate": 0.0001916320849100068, + "loss": 0.9777, + "step": 3800 + }, + { + "epoch": 0.22, + "grad_norm": 0.2734375, + "learning_rate": 0.00019159193585225026, + "loss": 0.9702, + "step": 3805 + }, + { + "epoch": 0.22, + "grad_norm": 0.26171875, + "learning_rate": 0.00019155169493418304, + "loss": 0.952, + "step": 3810 + }, + { + "epoch": 0.22, + "grad_norm": 0.255859375, + "learning_rate": 0.000191511362196164, + "loss": 0.9244, + "step": 3815 + }, + { + "epoch": 0.22, + "grad_norm": 0.240234375, + "learning_rate": 0.00019147093767864402, + "loss": 0.9819, + "step": 3820 + }, + { + "epoch": 0.22, + "grad_norm": 0.255859375, + "learning_rate": 0.00019143042142216607, + "loss": 0.9759, + "step": 3825 + }, + { + "epoch": 0.22, + "grad_norm": 0.2578125, + "learning_rate": 0.00019138981346736514, + "loss": 0.8902, + "step": 3830 + }, + { + "epoch": 0.22, + "grad_norm": 0.251953125, + "learning_rate": 0.00019134911385496815, + "loss": 0.9328, + "step": 3835 + }, + { + "epoch": 0.22, + "grad_norm": 0.255859375, + "learning_rate": 0.00019130832262579398, + "loss": 0.8924, + "step": 3840 + }, + { + "epoch": 0.22, + "grad_norm": 0.26953125, + "learning_rate": 0.00019126743982075337, + "loss": 0.9471, + "step": 3845 + }, + { + "epoch": 0.22, + "grad_norm": 0.26171875, + "learning_rate": 0.00019122646548084892, + "loss": 0.9705, + "step": 3850 + }, + { + "epoch": 0.22, + "grad_norm": 0.263671875, + "learning_rate": 0.00019118539964717505, + "loss": 1.0149, + "step": 3855 + }, + { + "epoch": 0.22, + "grad_norm": 0.263671875, + "learning_rate": 0.0001911442423609179, + "loss": 0.9364, + "step": 3860 + }, + { + "epoch": 0.22, + "grad_norm": 0.291015625, + "learning_rate": 0.00019110299366335536, + "loss": 0.9572, + "step": 3865 + }, + { + "epoch": 0.22, + "grad_norm": 0.2373046875, + "learning_rate": 0.00019106165359585698, + "loss": 0.9038, + "step": 3870 + }, + { + "epoch": 0.22, + "grad_norm": 0.2890625, + "learning_rate": 0.00019102022219988398, + "loss": 0.9761, + "step": 3875 + }, + { + "epoch": 0.22, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019097869951698913, + "loss": 1.0293, + "step": 3880 + }, + { + "epoch": 0.22, + "grad_norm": 0.263671875, + "learning_rate": 0.0001909370855888168, + "loss": 0.8963, + "step": 3885 + }, + { + "epoch": 0.22, + "grad_norm": 0.265625, + "learning_rate": 0.00019089538045710284, + "loss": 0.9691, + "step": 3890 + }, + { + "epoch": 0.22, + "grad_norm": 0.265625, + "learning_rate": 0.00019085358416367457, + "loss": 0.9661, + "step": 3895 + }, + { + "epoch": 0.22, + "grad_norm": 0.26953125, + "learning_rate": 0.0001908116967504508, + "loss": 0.9243, + "step": 3900 + }, + { + "epoch": 0.22, + "grad_norm": 0.271484375, + "learning_rate": 0.00019076971825944164, + "loss": 0.9582, + "step": 3905 + }, + { + "epoch": 0.22, + "grad_norm": 0.275390625, + "learning_rate": 0.00019072764873274856, + "loss": 0.973, + "step": 3910 + }, + { + "epoch": 0.22, + "grad_norm": 0.279296875, + "learning_rate": 0.0001906854882125644, + "loss": 0.9518, + "step": 3915 + }, + { + "epoch": 0.22, + "grad_norm": 0.279296875, + "learning_rate": 0.00019064323674117318, + "loss": 0.998, + "step": 3920 + }, + { + "epoch": 0.23, + "grad_norm": 0.25390625, + "learning_rate": 0.0001906008943609502, + "loss": 0.8808, + "step": 3925 + }, + { + "epoch": 0.23, + "grad_norm": 0.296875, + "learning_rate": 0.0001905584611143619, + "loss": 0.9755, + "step": 3930 + }, + { + "epoch": 0.23, + "grad_norm": 0.271484375, + "learning_rate": 0.00019051593704396587, + "loss": 0.9383, + "step": 3935 + }, + { + "epoch": 0.23, + "grad_norm": 0.279296875, + "learning_rate": 0.00019047332219241078, + "loss": 0.9663, + "step": 3940 + }, + { + "epoch": 0.23, + "grad_norm": 0.28125, + "learning_rate": 0.00019043061660243632, + "loss": 0.9642, + "step": 3945 + }, + { + "epoch": 0.23, + "grad_norm": 0.26171875, + "learning_rate": 0.00019038782031687325, + "loss": 1.0157, + "step": 3950 + }, + { + "epoch": 0.23, + "grad_norm": 0.259765625, + "learning_rate": 0.0001903449333786432, + "loss": 0.9732, + "step": 3955 + }, + { + "epoch": 0.23, + "grad_norm": 0.26171875, + "learning_rate": 0.00019030195583075881, + "loss": 0.9632, + "step": 3960 + }, + { + "epoch": 0.23, + "grad_norm": 0.259765625, + "learning_rate": 0.00019025888771632355, + "loss": 0.9818, + "step": 3965 + }, + { + "epoch": 0.23, + "grad_norm": 0.2734375, + "learning_rate": 0.00019021572907853177, + "loss": 0.9512, + "step": 3970 + }, + { + "epoch": 0.23, + "grad_norm": 0.251953125, + "learning_rate": 0.00019017247996066852, + "loss": 0.8931, + "step": 3975 + }, + { + "epoch": 0.23, + "grad_norm": 0.259765625, + "learning_rate": 0.00019012914040610963, + "loss": 1.0358, + "step": 3980 + }, + { + "epoch": 0.23, + "grad_norm": 0.26171875, + "learning_rate": 0.00019008571045832167, + "loss": 0.9163, + "step": 3985 + }, + { + "epoch": 0.23, + "grad_norm": 0.267578125, + "learning_rate": 0.00019004219016086188, + "loss": 0.9641, + "step": 3990 + }, + { + "epoch": 0.23, + "grad_norm": 0.25, + "learning_rate": 0.00018999857955737798, + "loss": 0.9079, + "step": 3995 + }, + { + "epoch": 0.23, + "grad_norm": 0.298828125, + "learning_rate": 0.00018995487869160845, + "loss": 0.9652, + "step": 4000 + }, + { + "epoch": 0.23, + "grad_norm": 0.2578125, + "learning_rate": 0.00018991108760738214, + "loss": 0.8911, + "step": 4005 + }, + { + "epoch": 0.23, + "grad_norm": 0.2578125, + "learning_rate": 0.00018986720634861848, + "loss": 0.9227, + "step": 4010 + }, + { + "epoch": 0.23, + "grad_norm": 0.26953125, + "learning_rate": 0.00018982323495932732, + "loss": 0.9767, + "step": 4015 + }, + { + "epoch": 0.23, + "grad_norm": 0.25390625, + "learning_rate": 0.00018977917348360888, + "loss": 0.9541, + "step": 4020 + }, + { + "epoch": 0.23, + "grad_norm": 0.275390625, + "learning_rate": 0.0001897350219656537, + "loss": 0.9738, + "step": 4025 + }, + { + "epoch": 0.23, + "grad_norm": 0.267578125, + "learning_rate": 0.0001896907804497427, + "loss": 0.9014, + "step": 4030 + }, + { + "epoch": 0.23, + "grad_norm": 0.255859375, + "learning_rate": 0.00018964644898024707, + "loss": 1.0062, + "step": 4035 + }, + { + "epoch": 0.23, + "grad_norm": 0.25, + "learning_rate": 0.0001896020276016281, + "loss": 0.9882, + "step": 4040 + }, + { + "epoch": 0.23, + "grad_norm": 0.25390625, + "learning_rate": 0.00018955751635843737, + "loss": 0.9511, + "step": 4045 + }, + { + "epoch": 0.23, + "grad_norm": 0.255859375, + "learning_rate": 0.0001895129152953165, + "loss": 0.8994, + "step": 4050 + }, + { + "epoch": 0.23, + "grad_norm": 0.2578125, + "learning_rate": 0.00018946822445699735, + "loss": 0.9227, + "step": 4055 + }, + { + "epoch": 0.23, + "grad_norm": 0.259765625, + "learning_rate": 0.00018942344388830158, + "loss": 0.9347, + "step": 4060 + }, + { + "epoch": 0.23, + "grad_norm": 0.251953125, + "learning_rate": 0.00018937857363414106, + "loss": 0.9333, + "step": 4065 + }, + { + "epoch": 0.23, + "grad_norm": 0.251953125, + "learning_rate": 0.00018933361373951746, + "loss": 0.9831, + "step": 4070 + }, + { + "epoch": 0.23, + "grad_norm": 0.28515625, + "learning_rate": 0.00018928856424952245, + "loss": 0.8975, + "step": 4075 + }, + { + "epoch": 0.23, + "grad_norm": 0.25, + "learning_rate": 0.0001892434252093375, + "loss": 1.0322, + "step": 4080 + }, + { + "epoch": 0.23, + "grad_norm": 0.2578125, + "learning_rate": 0.00018919819666423396, + "loss": 0.9409, + "step": 4085 + }, + { + "epoch": 0.23, + "grad_norm": 0.283203125, + "learning_rate": 0.00018915287865957277, + "loss": 1.0335, + "step": 4090 + }, + { + "epoch": 0.23, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001891074712408049, + "loss": 0.9877, + "step": 4095 + }, + { + "epoch": 0.24, + "grad_norm": 0.28515625, + "learning_rate": 0.00018906197445347068, + "loss": 0.9743, + "step": 4100 + }, + { + "epoch": 0.24, + "grad_norm": 0.259765625, + "learning_rate": 0.0001890163883432003, + "loss": 0.935, + "step": 4105 + }, + { + "epoch": 0.24, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018897071295571335, + "loss": 0.9519, + "step": 4110 + }, + { + "epoch": 0.24, + "grad_norm": 0.255859375, + "learning_rate": 0.00018892494833681913, + "loss": 0.9171, + "step": 4115 + }, + { + "epoch": 0.24, + "grad_norm": 0.26171875, + "learning_rate": 0.00018887909453241632, + "loss": 0.9584, + "step": 4120 + }, + { + "epoch": 0.24, + "grad_norm": 0.267578125, + "learning_rate": 0.0001888331515884931, + "loss": 0.9841, + "step": 4125 + }, + { + "epoch": 0.24, + "grad_norm": 0.265625, + "learning_rate": 0.000188787119551127, + "loss": 1.0196, + "step": 4130 + }, + { + "epoch": 0.24, + "grad_norm": 0.283203125, + "learning_rate": 0.00018874099846648496, + "loss": 0.9448, + "step": 4135 + }, + { + "epoch": 0.24, + "grad_norm": 0.251953125, + "learning_rate": 0.0001886947883808232, + "loss": 0.9824, + "step": 4140 + }, + { + "epoch": 0.24, + "grad_norm": 0.306640625, + "learning_rate": 0.0001886484893404872, + "loss": 0.9046, + "step": 4145 + }, + { + "epoch": 0.24, + "grad_norm": 0.25, + "learning_rate": 0.0001886021013919117, + "loss": 0.9443, + "step": 4150 + }, + { + "epoch": 0.24, + "grad_norm": 0.28125, + "learning_rate": 0.00018855562458162055, + "loss": 0.9724, + "step": 4155 + }, + { + "epoch": 0.24, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001885090589562267, + "loss": 0.9864, + "step": 4160 + }, + { + "epoch": 0.24, + "grad_norm": 0.26953125, + "learning_rate": 0.00018846240456243225, + "loss": 0.9661, + "step": 4165 + }, + { + "epoch": 0.24, + "grad_norm": 0.25390625, + "learning_rate": 0.00018841566144702833, + "loss": 0.9449, + "step": 4170 + }, + { + "epoch": 0.24, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018836882965689493, + "loss": 0.9158, + "step": 4175 + }, + { + "epoch": 0.24, + "grad_norm": 0.2421875, + "learning_rate": 0.00018832190923900112, + "loss": 0.9386, + "step": 4180 + }, + { + "epoch": 0.24, + "grad_norm": 0.279296875, + "learning_rate": 0.00018827490024040484, + "loss": 0.9941, + "step": 4185 + }, + { + "epoch": 0.24, + "grad_norm": 0.271484375, + "learning_rate": 0.00018822780270825277, + "loss": 0.948, + "step": 4190 + }, + { + "epoch": 0.24, + "grad_norm": 0.2578125, + "learning_rate": 0.00018818061668978046, + "loss": 1.0238, + "step": 4195 + }, + { + "epoch": 0.24, + "grad_norm": 0.275390625, + "learning_rate": 0.0001881333422323122, + "loss": 0.9964, + "step": 4200 + }, + { + "epoch": 0.24, + "grad_norm": 0.28515625, + "learning_rate": 0.00018808597938326093, + "loss": 0.9123, + "step": 4205 + }, + { + "epoch": 0.24, + "grad_norm": 0.259765625, + "learning_rate": 0.00018803852819012832, + "loss": 0.9153, + "step": 4210 + }, + { + "epoch": 0.24, + "grad_norm": 0.26171875, + "learning_rate": 0.0001879909887005046, + "loss": 0.956, + "step": 4215 + }, + { + "epoch": 0.24, + "grad_norm": 0.24609375, + "learning_rate": 0.00018794336096206852, + "loss": 0.9875, + "step": 4220 + }, + { + "epoch": 0.24, + "grad_norm": 0.25390625, + "learning_rate": 0.00018789564502258741, + "loss": 0.9332, + "step": 4225 + }, + { + "epoch": 0.24, + "grad_norm": 0.271484375, + "learning_rate": 0.000187847840929917, + "loss": 1.0428, + "step": 4230 + }, + { + "epoch": 0.24, + "grad_norm": 0.2578125, + "learning_rate": 0.00018779994873200146, + "loss": 0.8838, + "step": 4235 + }, + { + "epoch": 0.24, + "grad_norm": 0.265625, + "learning_rate": 0.00018775196847687332, + "loss": 0.9732, + "step": 4240 + }, + { + "epoch": 0.24, + "grad_norm": 0.283203125, + "learning_rate": 0.0001877039002126534, + "loss": 0.9697, + "step": 4245 + }, + { + "epoch": 0.24, + "grad_norm": 0.265625, + "learning_rate": 0.00018765574398755085, + "loss": 1.0307, + "step": 4250 + }, + { + "epoch": 0.24, + "grad_norm": 0.25, + "learning_rate": 0.00018760749984986298, + "loss": 0.9221, + "step": 4255 + }, + { + "epoch": 0.24, + "grad_norm": 0.255859375, + "learning_rate": 0.0001875591678479753, + "loss": 0.911, + "step": 4260 + }, + { + "epoch": 0.24, + "grad_norm": 0.26171875, + "learning_rate": 0.00018751074803036142, + "loss": 0.9499, + "step": 4265 + }, + { + "epoch": 0.25, + "grad_norm": 0.26171875, + "learning_rate": 0.00018746224044558302, + "loss": 0.931, + "step": 4270 + }, + { + "epoch": 0.25, + "grad_norm": 0.26953125, + "learning_rate": 0.0001874136451422898, + "loss": 0.9934, + "step": 4275 + }, + { + "epoch": 0.25, + "grad_norm": 0.2578125, + "learning_rate": 0.0001873649621692195, + "loss": 1.0263, + "step": 4280 + }, + { + "epoch": 0.25, + "grad_norm": 0.25390625, + "learning_rate": 0.00018731619157519774, + "loss": 0.9163, + "step": 4285 + }, + { + "epoch": 0.25, + "grad_norm": 0.259765625, + "learning_rate": 0.00018726733340913797, + "loss": 0.9006, + "step": 4290 + }, + { + "epoch": 0.25, + "grad_norm": 0.26953125, + "learning_rate": 0.00018721838772004157, + "loss": 0.9588, + "step": 4295 + }, + { + "epoch": 0.25, + "grad_norm": 0.263671875, + "learning_rate": 0.0001871693545569976, + "loss": 0.972, + "step": 4300 + }, + { + "epoch": 0.25, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018712023396918293, + "loss": 0.8951, + "step": 4305 + }, + { + "epoch": 0.25, + "grad_norm": 0.259765625, + "learning_rate": 0.0001870710260058621, + "loss": 0.9148, + "step": 4310 + }, + { + "epoch": 0.25, + "grad_norm": 0.2578125, + "learning_rate": 0.00018702173071638716, + "loss": 0.8892, + "step": 4315 + }, + { + "epoch": 0.25, + "grad_norm": 0.27734375, + "learning_rate": 0.00018697234815019792, + "loss": 1.0288, + "step": 4320 + }, + { + "epoch": 0.25, + "grad_norm": 0.283203125, + "learning_rate": 0.00018692287835682164, + "loss": 1.0217, + "step": 4325 + }, + { + "epoch": 0.25, + "grad_norm": 0.283203125, + "learning_rate": 0.00018687332138587302, + "loss": 0.9623, + "step": 4330 + }, + { + "epoch": 0.25, + "grad_norm": 0.263671875, + "learning_rate": 0.0001868236772870543, + "loss": 0.9821, + "step": 4335 + }, + { + "epoch": 0.25, + "grad_norm": 0.26171875, + "learning_rate": 0.00018677394611015498, + "loss": 0.9868, + "step": 4340 + }, + { + "epoch": 0.25, + "grad_norm": 0.28515625, + "learning_rate": 0.000186724127905052, + "loss": 0.9652, + "step": 4345 + }, + { + "epoch": 0.25, + "grad_norm": 0.2734375, + "learning_rate": 0.00018667422272170955, + "loss": 0.9466, + "step": 4350 + }, + { + "epoch": 0.25, + "grad_norm": 0.275390625, + "learning_rate": 0.00018662423061017896, + "loss": 1.0052, + "step": 4355 + }, + { + "epoch": 0.25, + "grad_norm": 0.259765625, + "learning_rate": 0.00018657415162059892, + "loss": 0.9959, + "step": 4360 + }, + { + "epoch": 0.25, + "grad_norm": 0.26953125, + "learning_rate": 0.0001865239858031951, + "loss": 1.0139, + "step": 4365 + }, + { + "epoch": 0.25, + "grad_norm": 0.267578125, + "learning_rate": 0.00018647373320828035, + "loss": 0.9677, + "step": 4370 + }, + { + "epoch": 0.25, + "grad_norm": 0.26171875, + "learning_rate": 0.00018642339388625444, + "loss": 0.9707, + "step": 4375 + }, + { + "epoch": 0.25, + "grad_norm": 0.263671875, + "learning_rate": 0.0001863729678876043, + "loss": 0.9591, + "step": 4380 + }, + { + "epoch": 0.25, + "grad_norm": 0.283203125, + "learning_rate": 0.00018632245526290352, + "loss": 1.0214, + "step": 4385 + }, + { + "epoch": 0.25, + "grad_norm": 0.265625, + "learning_rate": 0.0001862718560628129, + "loss": 0.9707, + "step": 4390 + }, + { + "epoch": 0.25, + "grad_norm": 0.26953125, + "learning_rate": 0.00018622117033807985, + "loss": 0.9585, + "step": 4395 + }, + { + "epoch": 0.25, + "grad_norm": 0.265625, + "learning_rate": 0.00018617039813953855, + "loss": 0.8711, + "step": 4400 + }, + { + "epoch": 0.25, + "grad_norm": 0.255859375, + "learning_rate": 0.00018611953951811004, + "loss": 1.0133, + "step": 4405 + }, + { + "epoch": 0.25, + "grad_norm": 0.248046875, + "learning_rate": 0.00018606859452480193, + "loss": 0.9349, + "step": 4410 + }, + { + "epoch": 0.25, + "grad_norm": 0.263671875, + "learning_rate": 0.00018601756321070845, + "loss": 0.979, + "step": 4415 + }, + { + "epoch": 0.25, + "grad_norm": 0.2734375, + "learning_rate": 0.0001859664456270105, + "loss": 0.9793, + "step": 4420 + }, + { + "epoch": 0.25, + "grad_norm": 0.25390625, + "learning_rate": 0.00018591524182497547, + "loss": 0.8638, + "step": 4425 + }, + { + "epoch": 0.25, + "grad_norm": 0.275390625, + "learning_rate": 0.0001858639518559571, + "loss": 0.9971, + "step": 4430 + }, + { + "epoch": 0.25, + "grad_norm": 0.2734375, + "learning_rate": 0.00018581257577139572, + "loss": 0.9375, + "step": 4435 + }, + { + "epoch": 0.25, + "grad_norm": 0.2734375, + "learning_rate": 0.00018576111362281794, + "loss": 1.0139, + "step": 4440 + }, + { + "epoch": 0.26, + "grad_norm": 0.25390625, + "learning_rate": 0.00018570956546183666, + "loss": 1.0013, + "step": 4445 + }, + { + "epoch": 0.26, + "grad_norm": 0.28125, + "learning_rate": 0.00018565793134015115, + "loss": 0.9572, + "step": 4450 + }, + { + "epoch": 0.26, + "grad_norm": 0.267578125, + "learning_rate": 0.00018560621130954674, + "loss": 1.0208, + "step": 4455 + }, + { + "epoch": 0.26, + "grad_norm": 0.248046875, + "learning_rate": 0.00018555440542189508, + "loss": 0.9392, + "step": 4460 + }, + { + "epoch": 0.26, + "grad_norm": 0.259765625, + "learning_rate": 0.00018550251372915382, + "loss": 1.012, + "step": 4465 + }, + { + "epoch": 0.26, + "grad_norm": 0.240234375, + "learning_rate": 0.00018545053628336668, + "loss": 0.924, + "step": 4470 + }, + { + "epoch": 0.26, + "grad_norm": 0.248046875, + "learning_rate": 0.00018539847313666345, + "loss": 0.9599, + "step": 4475 + }, + { + "epoch": 0.26, + "grad_norm": 0.251953125, + "learning_rate": 0.00018534632434125982, + "loss": 0.9028, + "step": 4480 + }, + { + "epoch": 0.26, + "grad_norm": 0.287109375, + "learning_rate": 0.00018529408994945738, + "loss": 1.0321, + "step": 4485 + }, + { + "epoch": 0.26, + "grad_norm": 0.265625, + "learning_rate": 0.0001852417700136436, + "loss": 0.9446, + "step": 4490 + }, + { + "epoch": 0.26, + "grad_norm": 0.25, + "learning_rate": 0.00018518936458629165, + "loss": 0.9477, + "step": 4495 + }, + { + "epoch": 0.26, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018513687371996058, + "loss": 0.9671, + "step": 4500 + }, + { + "epoch": 0.26, + "grad_norm": 0.251953125, + "learning_rate": 0.000185084297467295, + "loss": 0.8988, + "step": 4505 + }, + { + "epoch": 0.26, + "grad_norm": 0.248046875, + "learning_rate": 0.0001850316358810253, + "loss": 0.9879, + "step": 4510 + }, + { + "epoch": 0.26, + "grad_norm": 0.2890625, + "learning_rate": 0.0001849788890139673, + "loss": 0.9617, + "step": 4515 + }, + { + "epoch": 0.26, + "grad_norm": 0.25390625, + "learning_rate": 0.00018492605691902242, + "loss": 0.9869, + "step": 4520 + }, + { + "epoch": 0.26, + "grad_norm": 0.2578125, + "learning_rate": 0.00018487313964917761, + "loss": 0.9515, + "step": 4525 + }, + { + "epoch": 0.26, + "grad_norm": 0.263671875, + "learning_rate": 0.00018482013725750512, + "loss": 0.9748, + "step": 4530 + }, + { + "epoch": 0.26, + "grad_norm": 0.263671875, + "learning_rate": 0.00018476704979716275, + "loss": 1.0375, + "step": 4535 + }, + { + "epoch": 0.26, + "grad_norm": 0.263671875, + "learning_rate": 0.00018471387732139344, + "loss": 0.9816, + "step": 4540 + }, + { + "epoch": 0.26, + "grad_norm": 0.2578125, + "learning_rate": 0.00018466061988352546, + "loss": 0.999, + "step": 4545 + }, + { + "epoch": 0.26, + "grad_norm": 0.267578125, + "learning_rate": 0.00018460727753697234, + "loss": 0.9573, + "step": 4550 + }, + { + "epoch": 0.26, + "grad_norm": 0.287109375, + "learning_rate": 0.00018455385033523268, + "loss": 0.9211, + "step": 4555 + }, + { + "epoch": 0.26, + "grad_norm": 0.271484375, + "learning_rate": 0.00018450033833189027, + "loss": 0.9198, + "step": 4560 + }, + { + "epoch": 0.26, + "grad_norm": 0.255859375, + "learning_rate": 0.0001844467415806139, + "loss": 0.9629, + "step": 4565 + }, + { + "epoch": 0.26, + "grad_norm": 0.25, + "learning_rate": 0.00018439306013515733, + "loss": 0.9674, + "step": 4570 + }, + { + "epoch": 0.26, + "grad_norm": 0.26953125, + "learning_rate": 0.00018433929404935935, + "loss": 0.9872, + "step": 4575 + }, + { + "epoch": 0.26, + "grad_norm": 0.259765625, + "learning_rate": 0.00018428544337714358, + "loss": 1.0281, + "step": 4580 + }, + { + "epoch": 0.26, + "grad_norm": 0.267578125, + "learning_rate": 0.00018423150817251845, + "loss": 0.9846, + "step": 4585 + }, + { + "epoch": 0.26, + "grad_norm": 0.294921875, + "learning_rate": 0.0001841774884895772, + "loss": 0.9727, + "step": 4590 + }, + { + "epoch": 0.26, + "grad_norm": 0.263671875, + "learning_rate": 0.00018412338438249782, + "loss": 0.9988, + "step": 4595 + }, + { + "epoch": 0.26, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018406919590554296, + "loss": 0.9363, + "step": 4600 + }, + { + "epoch": 0.26, + "grad_norm": 0.271484375, + "learning_rate": 0.00018401492311305985, + "loss": 0.9301, + "step": 4605 + }, + { + "epoch": 0.26, + "grad_norm": 0.267578125, + "learning_rate": 0.00018396056605948032, + "loss": 0.9556, + "step": 4610 + }, + { + "epoch": 0.26, + "grad_norm": 0.2578125, + "learning_rate": 0.00018390612479932066, + "loss": 1.0037, + "step": 4615 + }, + { + "epoch": 0.27, + "grad_norm": 0.271484375, + "learning_rate": 0.00018385159938718172, + "loss": 1.0031, + "step": 4620 + }, + { + "epoch": 0.27, + "grad_norm": 0.25, + "learning_rate": 0.00018379698987774858, + "loss": 0.9881, + "step": 4625 + }, + { + "epoch": 0.27, + "grad_norm": 0.2734375, + "learning_rate": 0.00018374229632579087, + "loss": 0.9284, + "step": 4630 + }, + { + "epoch": 0.27, + "grad_norm": 0.275390625, + "learning_rate": 0.00018368751878616234, + "loss": 0.9625, + "step": 4635 + }, + { + "epoch": 0.27, + "grad_norm": 0.265625, + "learning_rate": 0.00018363265731380102, + "loss": 0.9116, + "step": 4640 + }, + { + "epoch": 0.27, + "grad_norm": 0.27734375, + "learning_rate": 0.00018357771196372916, + "loss": 0.9498, + "step": 4645 + }, + { + "epoch": 0.27, + "grad_norm": 0.271484375, + "learning_rate": 0.00018352268279105314, + "loss": 0.9198, + "step": 4650 + }, + { + "epoch": 0.27, + "grad_norm": 0.2578125, + "learning_rate": 0.0001834675698509633, + "loss": 0.9423, + "step": 4655 + }, + { + "epoch": 0.27, + "grad_norm": 0.259765625, + "learning_rate": 0.0001834123731987341, + "loss": 0.9621, + "step": 4660 + }, + { + "epoch": 0.27, + "grad_norm": 0.267578125, + "learning_rate": 0.00018335709288972395, + "loss": 0.9525, + "step": 4665 + }, + { + "epoch": 0.27, + "grad_norm": 0.265625, + "learning_rate": 0.00018330172897937513, + "loss": 0.9146, + "step": 4670 + }, + { + "epoch": 0.27, + "grad_norm": 0.2734375, + "learning_rate": 0.00018324628152321373, + "loss": 0.9209, + "step": 4675 + }, + { + "epoch": 0.27, + "grad_norm": 0.271484375, + "learning_rate": 0.00018319075057684968, + "loss": 1.0023, + "step": 4680 + }, + { + "epoch": 0.27, + "grad_norm": 0.255859375, + "learning_rate": 0.0001831351361959767, + "loss": 0.9152, + "step": 4685 + }, + { + "epoch": 0.27, + "grad_norm": 0.27734375, + "learning_rate": 0.0001830794384363721, + "loss": 0.9415, + "step": 4690 + }, + { + "epoch": 0.27, + "grad_norm": 0.267578125, + "learning_rate": 0.00018302365735389678, + "loss": 1.0131, + "step": 4695 + }, + { + "epoch": 0.27, + "grad_norm": 0.271484375, + "learning_rate": 0.00018296779300449535, + "loss": 0.9834, + "step": 4700 + }, + { + "epoch": 0.27, + "grad_norm": 0.251953125, + "learning_rate": 0.00018291184544419578, + "loss": 0.8716, + "step": 4705 + }, + { + "epoch": 0.27, + "grad_norm": 0.279296875, + "learning_rate": 0.00018285581472910964, + "loss": 1.0339, + "step": 4710 + }, + { + "epoch": 0.27, + "grad_norm": 0.24609375, + "learning_rate": 0.00018279970091543174, + "loss": 0.9977, + "step": 4715 + }, + { + "epoch": 0.27, + "grad_norm": 0.26171875, + "learning_rate": 0.0001827435040594404, + "loss": 0.9412, + "step": 4720 + }, + { + "epoch": 0.27, + "grad_norm": 0.275390625, + "learning_rate": 0.00018268722421749703, + "loss": 1.0184, + "step": 4725 + }, + { + "epoch": 0.27, + "grad_norm": 0.25390625, + "learning_rate": 0.0001826308614460465, + "loss": 0.9689, + "step": 4730 + }, + { + "epoch": 0.27, + "grad_norm": 0.306640625, + "learning_rate": 0.00018257441580161663, + "loss": 0.9036, + "step": 4735 + }, + { + "epoch": 0.27, + "grad_norm": 0.263671875, + "learning_rate": 0.00018251788734081849, + "loss": 0.9537, + "step": 4740 + }, + { + "epoch": 0.27, + "grad_norm": 0.271484375, + "learning_rate": 0.0001824612761203462, + "loss": 0.9667, + "step": 4745 + }, + { + "epoch": 0.27, + "grad_norm": 0.25390625, + "learning_rate": 0.00018240458219697685, + "loss": 1.0325, + "step": 4750 + }, + { + "epoch": 0.27, + "grad_norm": 0.287109375, + "learning_rate": 0.00018234780562757045, + "loss": 0.96, + "step": 4755 + }, + { + "epoch": 0.27, + "grad_norm": 0.279296875, + "learning_rate": 0.00018229094646906997, + "loss": 0.9985, + "step": 4760 + }, + { + "epoch": 0.27, + "grad_norm": 0.263671875, + "learning_rate": 0.00018223400477850117, + "loss": 0.9671, + "step": 4765 + }, + { + "epoch": 0.27, + "grad_norm": 0.27734375, + "learning_rate": 0.00018217698061297254, + "loss": 0.9163, + "step": 4770 + }, + { + "epoch": 0.27, + "grad_norm": 0.275390625, + "learning_rate": 0.00018211987402967536, + "loss": 0.9751, + "step": 4775 + }, + { + "epoch": 0.27, + "grad_norm": 0.259765625, + "learning_rate": 0.0001820626850858836, + "loss": 0.9423, + "step": 4780 + }, + { + "epoch": 0.27, + "grad_norm": 0.259765625, + "learning_rate": 0.00018200541383895367, + "loss": 0.9343, + "step": 4785 + }, + { + "epoch": 0.27, + "grad_norm": 0.279296875, + "learning_rate": 0.0001819480603463247, + "loss": 0.9399, + "step": 4790 + }, + { + "epoch": 0.28, + "grad_norm": 0.275390625, + "learning_rate": 0.00018189062466551824, + "loss": 1.035, + "step": 4795 + }, + { + "epoch": 0.28, + "grad_norm": 0.2578125, + "learning_rate": 0.0001818331068541382, + "loss": 0.9444, + "step": 4800 + }, + { + "epoch": 0.28, + "grad_norm": 0.26953125, + "learning_rate": 0.00018177550696987096, + "loss": 0.9632, + "step": 4805 + }, + { + "epoch": 0.28, + "grad_norm": 0.271484375, + "learning_rate": 0.0001817178250704852, + "loss": 0.913, + "step": 4810 + }, + { + "epoch": 0.28, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018166006121383185, + "loss": 0.918, + "step": 4815 + }, + { + "epoch": 0.28, + "grad_norm": 0.275390625, + "learning_rate": 0.00018160221545784392, + "loss": 0.958, + "step": 4820 + }, + { + "epoch": 0.28, + "grad_norm": 0.26953125, + "learning_rate": 0.00018154428786053677, + "loss": 1.1622, + "step": 4825 + }, + { + "epoch": 0.28, + "grad_norm": 0.26171875, + "learning_rate": 0.00018148627848000768, + "loss": 0.9385, + "step": 4830 + }, + { + "epoch": 0.28, + "grad_norm": 0.255859375, + "learning_rate": 0.00018142818737443603, + "loss": 0.9235, + "step": 4835 + }, + { + "epoch": 0.28, + "grad_norm": 0.2578125, + "learning_rate": 0.00018137001460208309, + "loss": 1.0112, + "step": 4840 + }, + { + "epoch": 0.28, + "grad_norm": 0.275390625, + "learning_rate": 0.00018131176022129214, + "loss": 0.9345, + "step": 4845 + }, + { + "epoch": 0.28, + "grad_norm": 0.25, + "learning_rate": 0.00018125342429048825, + "loss": 0.9295, + "step": 4850 + }, + { + "epoch": 0.28, + "grad_norm": 0.26953125, + "learning_rate": 0.00018119500686817824, + "loss": 0.9728, + "step": 4855 + }, + { + "epoch": 0.28, + "grad_norm": 0.265625, + "learning_rate": 0.00018113650801295073, + "loss": 0.9295, + "step": 4860 + }, + { + "epoch": 0.28, + "grad_norm": 0.26171875, + "learning_rate": 0.000181077927783476, + "loss": 0.9564, + "step": 4865 + }, + { + "epoch": 0.28, + "grad_norm": 0.255859375, + "learning_rate": 0.00018101926623850586, + "loss": 0.9301, + "step": 4870 + }, + { + "epoch": 0.28, + "grad_norm": 0.328125, + "learning_rate": 0.00018096052343687382, + "loss": 0.9672, + "step": 4875 + }, + { + "epoch": 0.28, + "grad_norm": 0.275390625, + "learning_rate": 0.00018090169943749476, + "loss": 0.9522, + "step": 4880 + }, + { + "epoch": 0.28, + "grad_norm": 0.29296875, + "learning_rate": 0.00018084279429936504, + "loss": 1.0778, + "step": 4885 + }, + { + "epoch": 0.28, + "grad_norm": 0.263671875, + "learning_rate": 0.00018078380808156245, + "loss": 0.9789, + "step": 4890 + }, + { + "epoch": 0.28, + "grad_norm": 0.333984375, + "learning_rate": 0.00018072474084324593, + "loss": 0.9687, + "step": 4895 + }, + { + "epoch": 0.28, + "grad_norm": 0.248046875, + "learning_rate": 0.00018066559264365593, + "loss": 1.0009, + "step": 4900 + }, + { + "epoch": 0.28, + "grad_norm": 0.279296875, + "learning_rate": 0.00018060636354211385, + "loss": 0.9497, + "step": 4905 + }, + { + "epoch": 0.28, + "grad_norm": 0.259765625, + "learning_rate": 0.0001805470535980224, + "loss": 0.9586, + "step": 4910 + }, + { + "epoch": 0.28, + "grad_norm": 0.255859375, + "learning_rate": 0.0001804876628708653, + "loss": 0.9354, + "step": 4915 + }, + { + "epoch": 0.28, + "grad_norm": 0.275390625, + "learning_rate": 0.00018042819142020727, + "loss": 0.9981, + "step": 4920 + }, + { + "epoch": 0.28, + "grad_norm": 0.271484375, + "learning_rate": 0.00018036863930569408, + "loss": 0.9645, + "step": 4925 + }, + { + "epoch": 0.28, + "grad_norm": 0.275390625, + "learning_rate": 0.00018030900658705227, + "loss": 0.9754, + "step": 4930 + }, + { + "epoch": 0.28, + "grad_norm": 0.2490234375, + "learning_rate": 0.00018024929332408933, + "loss": 0.9265, + "step": 4935 + }, + { + "epoch": 0.28, + "grad_norm": 0.26171875, + "learning_rate": 0.00018018949957669347, + "loss": 0.9608, + "step": 4940 + }, + { + "epoch": 0.28, + "grad_norm": 0.259765625, + "learning_rate": 0.00018012962540483364, + "loss": 0.9572, + "step": 4945 + }, + { + "epoch": 0.28, + "grad_norm": 0.2734375, + "learning_rate": 0.00018006967086855948, + "loss": 0.9657, + "step": 4950 + }, + { + "epoch": 0.28, + "grad_norm": 0.263671875, + "learning_rate": 0.00018000963602800117, + "loss": 1.0154, + "step": 4955 + }, + { + "epoch": 0.28, + "grad_norm": 0.2578125, + "learning_rate": 0.00017994952094336946, + "loss": 0.9399, + "step": 4960 + }, + { + "epoch": 0.28, + "grad_norm": 0.271484375, + "learning_rate": 0.0001798893256749556, + "loss": 0.9801, + "step": 4965 + }, + { + "epoch": 0.29, + "grad_norm": 0.2578125, + "learning_rate": 0.0001798290502831312, + "loss": 0.9232, + "step": 4970 + }, + { + "epoch": 0.29, + "grad_norm": 0.259765625, + "learning_rate": 0.0001797686948283483, + "loss": 0.9539, + "step": 4975 + }, + { + "epoch": 0.29, + "grad_norm": 0.2578125, + "learning_rate": 0.00017970825937113923, + "loss": 0.9298, + "step": 4980 + }, + { + "epoch": 0.29, + "grad_norm": 0.2578125, + "learning_rate": 0.00017964774397211643, + "loss": 0.9667, + "step": 4985 + }, + { + "epoch": 0.29, + "grad_norm": 0.263671875, + "learning_rate": 0.00017958714869197273, + "loss": 0.9768, + "step": 4990 + }, + { + "epoch": 0.29, + "grad_norm": 0.2578125, + "learning_rate": 0.00017952647359148087, + "loss": 0.9718, + "step": 4995 + }, + { + "epoch": 0.29, + "grad_norm": 0.259765625, + "learning_rate": 0.00017946571873149377, + "loss": 0.9546, + "step": 5000 + }, + { + "epoch": 0.29, + "grad_norm": 0.2578125, + "learning_rate": 0.00017940488417294437, + "loss": 1.0762, + "step": 5005 + }, + { + "epoch": 0.29, + "grad_norm": 0.271484375, + "learning_rate": 0.00017934396997684537, + "loss": 0.9056, + "step": 5010 + }, + { + "epoch": 0.29, + "grad_norm": 0.25390625, + "learning_rate": 0.00017928297620428953, + "loss": 0.9799, + "step": 5015 + }, + { + "epoch": 0.29, + "grad_norm": 0.26953125, + "learning_rate": 0.00017922190291644934, + "loss": 0.9785, + "step": 5020 + }, + { + "epoch": 0.29, + "grad_norm": 0.251953125, + "learning_rate": 0.00017916075017457698, + "loss": 0.9463, + "step": 5025 + }, + { + "epoch": 0.29, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017909951804000445, + "loss": 0.9665, + "step": 5030 + }, + { + "epoch": 0.29, + "grad_norm": 0.26171875, + "learning_rate": 0.0001790382065741432, + "loss": 0.9876, + "step": 5035 + }, + { + "epoch": 0.29, + "grad_norm": 0.267578125, + "learning_rate": 0.00017897681583848449, + "loss": 0.9929, + "step": 5040 + }, + { + "epoch": 0.29, + "grad_norm": 0.265625, + "learning_rate": 0.00017891534589459883, + "loss": 0.991, + "step": 5045 + }, + { + "epoch": 0.29, + "grad_norm": 0.27734375, + "learning_rate": 0.00017885379680413627, + "loss": 0.9451, + "step": 5050 + }, + { + "epoch": 0.29, + "grad_norm": 0.271484375, + "learning_rate": 0.0001787921686288263, + "loss": 0.9316, + "step": 5055 + }, + { + "epoch": 0.29, + "grad_norm": 0.259765625, + "learning_rate": 0.00017873046143047767, + "loss": 0.9187, + "step": 5060 + }, + { + "epoch": 0.29, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017866867527097837, + "loss": 0.9836, + "step": 5065 + }, + { + "epoch": 0.29, + "grad_norm": 0.265625, + "learning_rate": 0.0001786068102122956, + "loss": 0.9655, + "step": 5070 + }, + { + "epoch": 0.29, + "grad_norm": 0.275390625, + "learning_rate": 0.00017854486631647569, + "loss": 0.9407, + "step": 5075 + }, + { + "epoch": 0.29, + "grad_norm": 0.2578125, + "learning_rate": 0.00017848284364564406, + "loss": 0.9331, + "step": 5080 + }, + { + "epoch": 0.29, + "grad_norm": 0.271484375, + "learning_rate": 0.00017842074226200505, + "loss": 1.0075, + "step": 5085 + }, + { + "epoch": 0.29, + "grad_norm": 0.24609375, + "learning_rate": 0.0001783585622278421, + "loss": 0.8936, + "step": 5090 + }, + { + "epoch": 0.29, + "grad_norm": 0.265625, + "learning_rate": 0.00017829630360551737, + "loss": 0.963, + "step": 5095 + }, + { + "epoch": 0.29, + "grad_norm": 0.26953125, + "learning_rate": 0.0001782339664574719, + "loss": 0.9445, + "step": 5100 + }, + { + "epoch": 0.29, + "grad_norm": 0.267578125, + "learning_rate": 0.00017817155084622562, + "loss": 0.9558, + "step": 5105 + }, + { + "epoch": 0.29, + "grad_norm": 0.263671875, + "learning_rate": 0.00017810905683437683, + "loss": 0.9419, + "step": 5110 + }, + { + "epoch": 0.29, + "grad_norm": 0.283203125, + "learning_rate": 0.0001780464844846028, + "loss": 1.0337, + "step": 5115 + }, + { + "epoch": 0.29, + "grad_norm": 0.259765625, + "learning_rate": 0.00017798383385965918, + "loss": 0.9724, + "step": 5120 + }, + { + "epoch": 0.29, + "grad_norm": 0.28125, + "learning_rate": 0.00017792110502238016, + "loss": 0.971, + "step": 5125 + }, + { + "epoch": 0.29, + "grad_norm": 0.259765625, + "learning_rate": 0.0001778582980356784, + "loss": 0.9743, + "step": 5130 + }, + { + "epoch": 0.29, + "grad_norm": 0.275390625, + "learning_rate": 0.00017779541296254487, + "loss": 1.0086, + "step": 5135 + }, + { + "epoch": 0.29, + "grad_norm": 0.26953125, + "learning_rate": 0.00017773244986604895, + "loss": 0.9543, + "step": 5140 + }, + { + "epoch": 0.3, + "grad_norm": 0.275390625, + "learning_rate": 0.00017766940880933825, + "loss": 0.9424, + "step": 5145 + }, + { + "epoch": 0.3, + "grad_norm": 0.2578125, + "learning_rate": 0.00017760628985563845, + "loss": 0.9988, + "step": 5150 + }, + { + "epoch": 0.3, + "grad_norm": 0.248046875, + "learning_rate": 0.00017754309306825357, + "loss": 0.8561, + "step": 5155 + }, + { + "epoch": 0.3, + "grad_norm": 0.2578125, + "learning_rate": 0.00017747981851056548, + "loss": 0.9413, + "step": 5160 + }, + { + "epoch": 0.3, + "grad_norm": 0.25390625, + "learning_rate": 0.00017741646624603417, + "loss": 0.9704, + "step": 5165 + }, + { + "epoch": 0.3, + "grad_norm": 0.263671875, + "learning_rate": 0.00017735303633819753, + "loss": 1.0081, + "step": 5170 + }, + { + "epoch": 0.3, + "grad_norm": 0.259765625, + "learning_rate": 0.00017728952885067133, + "loss": 0.9601, + "step": 5175 + }, + { + "epoch": 0.3, + "grad_norm": 0.2578125, + "learning_rate": 0.00017722594384714916, + "loss": 0.9446, + "step": 5180 + }, + { + "epoch": 0.3, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017716228139140228, + "loss": 0.9643, + "step": 5185 + }, + { + "epoch": 0.3, + "grad_norm": 0.248046875, + "learning_rate": 0.00017709854154727975, + "loss": 0.9975, + "step": 5190 + }, + { + "epoch": 0.3, + "grad_norm": 0.2890625, + "learning_rate": 0.00017703472437870813, + "loss": 1.0159, + "step": 5195 + }, + { + "epoch": 0.3, + "grad_norm": 0.26171875, + "learning_rate": 0.00017697082994969158, + "loss": 1.0124, + "step": 5200 + }, + { + "epoch": 0.3, + "grad_norm": 0.2734375, + "learning_rate": 0.0001769068583243118, + "loss": 0.9084, + "step": 5205 + }, + { + "epoch": 0.3, + "grad_norm": 0.2578125, + "learning_rate": 0.0001768428095667278, + "loss": 0.8991, + "step": 5210 + }, + { + "epoch": 0.3, + "grad_norm": 0.27734375, + "learning_rate": 0.00017677868374117606, + "loss": 0.9662, + "step": 5215 + }, + { + "epoch": 0.3, + "grad_norm": 0.26953125, + "learning_rate": 0.00017671448091197026, + "loss": 0.9955, + "step": 5220 + }, + { + "epoch": 0.3, + "grad_norm": 0.265625, + "learning_rate": 0.00017665020114350136, + "loss": 0.9421, + "step": 5225 + }, + { + "epoch": 0.3, + "grad_norm": 0.255859375, + "learning_rate": 0.00017658584450023747, + "loss": 0.9946, + "step": 5230 + }, + { + "epoch": 0.3, + "grad_norm": 0.26953125, + "learning_rate": 0.0001765214110467238, + "loss": 0.9487, + "step": 5235 + }, + { + "epoch": 0.3, + "grad_norm": 0.248046875, + "learning_rate": 0.00017645690084758267, + "loss": 0.9412, + "step": 5240 + }, + { + "epoch": 0.3, + "grad_norm": 0.279296875, + "learning_rate": 0.00017639231396751322, + "loss": 0.9802, + "step": 5245 + }, + { + "epoch": 0.3, + "grad_norm": 0.2578125, + "learning_rate": 0.00017632765047129157, + "loss": 0.9389, + "step": 5250 + }, + { + "epoch": 0.3, + "grad_norm": 0.255859375, + "learning_rate": 0.00017626291042377077, + "loss": 0.9462, + "step": 5255 + }, + { + "epoch": 0.3, + "grad_norm": 0.267578125, + "learning_rate": 0.00017619809388988049, + "loss": 0.9653, + "step": 5260 + }, + { + "epoch": 0.3, + "grad_norm": 0.25390625, + "learning_rate": 0.00017613320093462723, + "loss": 0.9345, + "step": 5265 + }, + { + "epoch": 0.3, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017606823162309406, + "loss": 0.9675, + "step": 5270 + }, + { + "epoch": 0.3, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017600318602044066, + "loss": 1.0095, + "step": 5275 + }, + { + "epoch": 0.3, + "grad_norm": 0.275390625, + "learning_rate": 0.00017593806419190325, + "loss": 0.9232, + "step": 5280 + }, + { + "epoch": 0.3, + "grad_norm": 0.265625, + "learning_rate": 0.00017587286620279443, + "loss": 0.9346, + "step": 5285 + }, + { + "epoch": 0.3, + "grad_norm": 0.26953125, + "learning_rate": 0.00017580759211850323, + "loss": 0.9148, + "step": 5290 + }, + { + "epoch": 0.3, + "grad_norm": 0.265625, + "learning_rate": 0.00017574224200449506, + "loss": 0.9724, + "step": 5295 + }, + { + "epoch": 0.3, + "grad_norm": 0.259765625, + "learning_rate": 0.00017567681592631145, + "loss": 0.958, + "step": 5300 + }, + { + "epoch": 0.3, + "grad_norm": 0.279296875, + "learning_rate": 0.00017561131394957022, + "loss": 0.9913, + "step": 5305 + }, + { + "epoch": 0.3, + "grad_norm": 0.263671875, + "learning_rate": 0.00017554573613996524, + "loss": 0.9496, + "step": 5310 + }, + { + "epoch": 0.3, + "grad_norm": 0.30859375, + "learning_rate": 0.00017548008256326655, + "loss": 0.9953, + "step": 5315 + }, + { + "epoch": 0.31, + "grad_norm": 0.26171875, + "learning_rate": 0.00017541435328531996, + "loss": 0.9762, + "step": 5320 + }, + { + "epoch": 0.31, + "grad_norm": 0.28125, + "learning_rate": 0.00017534854837204745, + "loss": 0.9767, + "step": 5325 + }, + { + "epoch": 0.31, + "grad_norm": 0.2578125, + "learning_rate": 0.00017528266788944676, + "loss": 0.9449, + "step": 5330 + }, + { + "epoch": 0.31, + "grad_norm": 0.25390625, + "learning_rate": 0.00017521671190359132, + "loss": 0.9, + "step": 5335 + }, + { + "epoch": 0.31, + "grad_norm": 0.25390625, + "learning_rate": 0.00017515068048063048, + "loss": 0.9287, + "step": 5340 + }, + { + "epoch": 0.31, + "grad_norm": 0.2734375, + "learning_rate": 0.00017508457368678904, + "loss": 0.9636, + "step": 5345 + }, + { + "epoch": 0.31, + "grad_norm": 0.271484375, + "learning_rate": 0.00017501839158836756, + "loss": 0.9587, + "step": 5350 + }, + { + "epoch": 0.31, + "grad_norm": 0.25390625, + "learning_rate": 0.00017495213425174205, + "loss": 0.9104, + "step": 5355 + }, + { + "epoch": 0.31, + "grad_norm": 0.28515625, + "learning_rate": 0.000174885801743364, + "loss": 0.9747, + "step": 5360 + }, + { + "epoch": 0.31, + "grad_norm": 0.265625, + "learning_rate": 0.00017481939412976024, + "loss": 0.9149, + "step": 5365 + }, + { + "epoch": 0.31, + "grad_norm": 0.27734375, + "learning_rate": 0.00017475291147753299, + "loss": 0.9608, + "step": 5370 + }, + { + "epoch": 0.31, + "grad_norm": 0.251953125, + "learning_rate": 0.0001746863538533597, + "loss": 0.9185, + "step": 5375 + }, + { + "epoch": 0.31, + "grad_norm": 0.2734375, + "learning_rate": 0.000174619721323993, + "loss": 0.9215, + "step": 5380 + }, + { + "epoch": 0.31, + "grad_norm": 0.255859375, + "learning_rate": 0.0001745530139562607, + "loss": 0.929, + "step": 5385 + }, + { + "epoch": 0.31, + "grad_norm": 0.263671875, + "learning_rate": 0.0001744862318170656, + "loss": 0.9378, + "step": 5390 + }, + { + "epoch": 0.31, + "grad_norm": 0.27734375, + "learning_rate": 0.00017441937497338552, + "loss": 0.9807, + "step": 5395 + }, + { + "epoch": 0.31, + "grad_norm": 0.265625, + "learning_rate": 0.0001743524434922732, + "loss": 0.9641, + "step": 5400 + }, + { + "epoch": 0.31, + "grad_norm": 0.2578125, + "learning_rate": 0.00017428543744085623, + "loss": 1.0143, + "step": 5405 + }, + { + "epoch": 0.31, + "grad_norm": 0.259765625, + "learning_rate": 0.00017421835688633704, + "loss": 0.9707, + "step": 5410 + }, + { + "epoch": 0.31, + "grad_norm": 0.267578125, + "learning_rate": 0.0001741512018959927, + "loss": 1.0347, + "step": 5415 + }, + { + "epoch": 0.31, + "grad_norm": 0.27734375, + "learning_rate": 0.00017408397253717496, + "loss": 1.0184, + "step": 5420 + }, + { + "epoch": 0.31, + "grad_norm": 0.267578125, + "learning_rate": 0.0001740166688773102, + "loss": 0.9867, + "step": 5425 + }, + { + "epoch": 0.31, + "grad_norm": 0.267578125, + "learning_rate": 0.00017394929098389929, + "loss": 0.986, + "step": 5430 + }, + { + "epoch": 0.31, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017388183892451755, + "loss": 0.9799, + "step": 5435 + }, + { + "epoch": 0.31, + "grad_norm": 0.283203125, + "learning_rate": 0.00017381431276681464, + "loss": 0.9343, + "step": 5440 + }, + { + "epoch": 0.31, + "grad_norm": 0.255859375, + "learning_rate": 0.0001737467125785146, + "loss": 0.9886, + "step": 5445 + }, + { + "epoch": 0.31, + "grad_norm": 0.291015625, + "learning_rate": 0.0001736790384274157, + "loss": 1.0098, + "step": 5450 + }, + { + "epoch": 0.31, + "grad_norm": 0.267578125, + "learning_rate": 0.00017361129038139038, + "loss": 0.9989, + "step": 5455 + }, + { + "epoch": 0.31, + "grad_norm": 0.26953125, + "learning_rate": 0.0001735434685083852, + "loss": 0.9337, + "step": 5460 + }, + { + "epoch": 0.31, + "grad_norm": 0.26953125, + "learning_rate": 0.00017347557287642076, + "loss": 1.0127, + "step": 5465 + }, + { + "epoch": 0.31, + "grad_norm": 0.283203125, + "learning_rate": 0.00017340760355359161, + "loss": 0.9243, + "step": 5470 + }, + { + "epoch": 0.31, + "grad_norm": 0.2734375, + "learning_rate": 0.0001733395606080663, + "loss": 0.9355, + "step": 5475 + }, + { + "epoch": 0.31, + "grad_norm": 0.26953125, + "learning_rate": 0.00017327144410808707, + "loss": 0.9836, + "step": 5480 + }, + { + "epoch": 0.31, + "grad_norm": 0.2578125, + "learning_rate": 0.00017320325412197, + "loss": 0.8971, + "step": 5485 + }, + { + "epoch": 0.32, + "grad_norm": 0.283203125, + "learning_rate": 0.00017313499071810497, + "loss": 0.937, + "step": 5490 + }, + { + "epoch": 0.32, + "grad_norm": 0.27734375, + "learning_rate": 0.00017306665396495534, + "loss": 1.0036, + "step": 5495 + }, + { + "epoch": 0.32, + "grad_norm": 0.263671875, + "learning_rate": 0.0001729982439310581, + "loss": 0.9265, + "step": 5500 + }, + { + "epoch": 0.32, + "grad_norm": 0.2578125, + "learning_rate": 0.00017292976068502376, + "loss": 0.9741, + "step": 5505 + }, + { + "epoch": 0.32, + "grad_norm": 0.283203125, + "learning_rate": 0.0001728612042955362, + "loss": 1.0222, + "step": 5510 + }, + { + "epoch": 0.32, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017279257483135272, + "loss": 0.9135, + "step": 5515 + }, + { + "epoch": 0.32, + "grad_norm": 0.298828125, + "learning_rate": 0.00017272387236130383, + "loss": 0.983, + "step": 5520 + }, + { + "epoch": 0.32, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017265509695429335, + "loss": 0.9202, + "step": 5525 + }, + { + "epoch": 0.32, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017258624867929817, + "loss": 0.9667, + "step": 5530 + }, + { + "epoch": 0.32, + "grad_norm": 0.271484375, + "learning_rate": 0.00017251732760536833, + "loss": 0.9566, + "step": 5535 + }, + { + "epoch": 0.32, + "grad_norm": 0.25390625, + "learning_rate": 0.00017244833380162687, + "loss": 0.9045, + "step": 5540 + }, + { + "epoch": 0.32, + "grad_norm": 0.2734375, + "learning_rate": 0.0001723792673372697, + "loss": 0.9259, + "step": 5545 + }, + { + "epoch": 0.32, + "grad_norm": 0.26953125, + "learning_rate": 0.00017231012828156566, + "loss": 0.9829, + "step": 5550 + }, + { + "epoch": 0.32, + "grad_norm": 0.263671875, + "learning_rate": 0.00017224091670385642, + "loss": 0.8636, + "step": 5555 + }, + { + "epoch": 0.32, + "grad_norm": 0.25, + "learning_rate": 0.00017217163267355638, + "loss": 0.9075, + "step": 5560 + }, + { + "epoch": 0.32, + "grad_norm": 0.27734375, + "learning_rate": 0.00017210227626015252, + "loss": 0.9639, + "step": 5565 + }, + { + "epoch": 0.32, + "grad_norm": 0.259765625, + "learning_rate": 0.00017203284753320447, + "loss": 0.9425, + "step": 5570 + }, + { + "epoch": 0.32, + "grad_norm": 0.2734375, + "learning_rate": 0.00017196334656234446, + "loss": 0.9542, + "step": 5575 + }, + { + "epoch": 0.32, + "grad_norm": 0.275390625, + "learning_rate": 0.00017189377341727708, + "loss": 0.9048, + "step": 5580 + }, + { + "epoch": 0.32, + "grad_norm": 0.275390625, + "learning_rate": 0.00017182412816777931, + "loss": 0.924, + "step": 5585 + }, + { + "epoch": 0.32, + "grad_norm": 0.259765625, + "learning_rate": 0.00017175441088370045, + "loss": 0.9078, + "step": 5590 + }, + { + "epoch": 0.32, + "grad_norm": 0.2734375, + "learning_rate": 0.00017168462163496214, + "loss": 0.9432, + "step": 5595 + }, + { + "epoch": 0.32, + "grad_norm": 0.251953125, + "learning_rate": 0.00017161476049155807, + "loss": 0.9207, + "step": 5600 + }, + { + "epoch": 0.32, + "grad_norm": 0.255859375, + "learning_rate": 0.00017154482752355406, + "loss": 1.008, + "step": 5605 + }, + { + "epoch": 0.32, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017147482280108802, + "loss": 0.9298, + "step": 5610 + }, + { + "epoch": 0.32, + "grad_norm": 0.251953125, + "learning_rate": 0.00017140474639436981, + "loss": 0.9983, + "step": 5615 + }, + { + "epoch": 0.32, + "grad_norm": 0.265625, + "learning_rate": 0.0001713345983736811, + "loss": 0.9208, + "step": 5620 + }, + { + "epoch": 0.32, + "grad_norm": 0.294921875, + "learning_rate": 0.00017126437880937557, + "loss": 0.9903, + "step": 5625 + }, + { + "epoch": 0.32, + "grad_norm": 0.271484375, + "learning_rate": 0.00017119408777187842, + "loss": 0.959, + "step": 5630 + }, + { + "epoch": 0.32, + "grad_norm": 0.251953125, + "learning_rate": 0.00017112372533168672, + "loss": 0.9756, + "step": 5635 + }, + { + "epoch": 0.32, + "grad_norm": 0.271484375, + "learning_rate": 0.00017105329155936905, + "loss": 0.9773, + "step": 5640 + }, + { + "epoch": 0.32, + "grad_norm": 0.251953125, + "learning_rate": 0.0001709827865255656, + "loss": 0.9547, + "step": 5645 + }, + { + "epoch": 0.32, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001709122103009879, + "loss": 1.0082, + "step": 5650 + }, + { + "epoch": 0.32, + "grad_norm": 0.259765625, + "learning_rate": 0.00017084156295641906, + "loss": 1.0108, + "step": 5655 + }, + { + "epoch": 0.32, + "grad_norm": 0.26171875, + "learning_rate": 0.0001707708445627134, + "loss": 0.9202, + "step": 5660 + }, + { + "epoch": 0.33, + "grad_norm": 0.267578125, + "learning_rate": 0.00017070005519079652, + "loss": 0.9213, + "step": 5665 + }, + { + "epoch": 0.33, + "grad_norm": 0.265625, + "learning_rate": 0.00017062919491166523, + "loss": 0.9814, + "step": 5670 + }, + { + "epoch": 0.33, + "grad_norm": 0.25390625, + "learning_rate": 0.00017055826379638742, + "loss": 0.9815, + "step": 5675 + }, + { + "epoch": 0.33, + "grad_norm": 0.3046875, + "learning_rate": 0.00017048726191610202, + "loss": 0.9683, + "step": 5680 + }, + { + "epoch": 0.33, + "grad_norm": 0.259765625, + "learning_rate": 0.00017041618934201904, + "loss": 0.9397, + "step": 5685 + }, + { + "epoch": 0.33, + "grad_norm": 0.26171875, + "learning_rate": 0.0001703450461454192, + "loss": 1.0024, + "step": 5690 + }, + { + "epoch": 0.33, + "grad_norm": 0.251953125, + "learning_rate": 0.00017027383239765422, + "loss": 0.9648, + "step": 5695 + }, + { + "epoch": 0.33, + "grad_norm": 0.265625, + "learning_rate": 0.0001702025481701465, + "loss": 0.924, + "step": 5700 + }, + { + "epoch": 0.33, + "grad_norm": 0.25390625, + "learning_rate": 0.00017013119353438913, + "loss": 0.9824, + "step": 5705 + }, + { + "epoch": 0.33, + "grad_norm": 0.259765625, + "learning_rate": 0.00017005976856194582, + "loss": 1.0145, + "step": 5710 + }, + { + "epoch": 0.33, + "grad_norm": 0.2578125, + "learning_rate": 0.00016998827332445084, + "loss": 0.9222, + "step": 5715 + }, + { + "epoch": 0.33, + "grad_norm": 0.259765625, + "learning_rate": 0.0001699167078936089, + "loss": 0.8669, + "step": 5720 + }, + { + "epoch": 0.33, + "grad_norm": 0.25390625, + "learning_rate": 0.0001698450723411951, + "loss": 0.8831, + "step": 5725 + }, + { + "epoch": 0.33, + "grad_norm": 0.291015625, + "learning_rate": 0.00016977336673905497, + "loss": 0.9277, + "step": 5730 + }, + { + "epoch": 0.33, + "grad_norm": 0.34765625, + "learning_rate": 0.00016970159115910417, + "loss": 0.9618, + "step": 5735 + }, + { + "epoch": 0.33, + "grad_norm": 0.263671875, + "learning_rate": 0.00016962974567332858, + "loss": 0.9225, + "step": 5740 + }, + { + "epoch": 0.33, + "grad_norm": 0.27734375, + "learning_rate": 0.00016955783035378424, + "loss": 0.9228, + "step": 5745 + }, + { + "epoch": 0.33, + "grad_norm": 0.27734375, + "learning_rate": 0.00016948584527259715, + "loss": 0.9805, + "step": 5750 + }, + { + "epoch": 0.33, + "grad_norm": 0.2734375, + "learning_rate": 0.0001694137905019633, + "loss": 0.9336, + "step": 5755 + }, + { + "epoch": 0.33, + "grad_norm": 0.26171875, + "learning_rate": 0.00016934166611414867, + "loss": 0.9763, + "step": 5760 + }, + { + "epoch": 0.33, + "grad_norm": 0.26953125, + "learning_rate": 0.0001692694721814889, + "loss": 0.9667, + "step": 5765 + }, + { + "epoch": 0.33, + "grad_norm": 0.251953125, + "learning_rate": 0.0001691972087763895, + "loss": 0.9347, + "step": 5770 + }, + { + "epoch": 0.33, + "grad_norm": 0.255859375, + "learning_rate": 0.0001691248759713256, + "loss": 1.0006, + "step": 5775 + }, + { + "epoch": 0.33, + "grad_norm": 0.251953125, + "learning_rate": 0.00016905247383884196, + "loss": 0.9602, + "step": 5780 + }, + { + "epoch": 0.33, + "grad_norm": 0.302734375, + "learning_rate": 0.00016898000245155282, + "loss": 0.9398, + "step": 5785 + }, + { + "epoch": 0.33, + "grad_norm": 0.27734375, + "learning_rate": 0.000168907461882142, + "loss": 0.9302, + "step": 5790 + }, + { + "epoch": 0.33, + "grad_norm": 0.294921875, + "learning_rate": 0.00016883485220336257, + "loss": 0.916, + "step": 5795 + }, + { + "epoch": 0.33, + "grad_norm": 0.255859375, + "learning_rate": 0.00016876217348803693, + "loss": 0.889, + "step": 5800 + }, + { + "epoch": 0.33, + "grad_norm": 0.279296875, + "learning_rate": 0.00016868942580905677, + "loss": 0.9243, + "step": 5805 + }, + { + "epoch": 0.33, + "grad_norm": 0.279296875, + "learning_rate": 0.000168616609239383, + "loss": 0.9543, + "step": 5810 + }, + { + "epoch": 0.33, + "grad_norm": 0.259765625, + "learning_rate": 0.0001685437238520455, + "loss": 1.0033, + "step": 5815 + }, + { + "epoch": 0.33, + "grad_norm": 0.267578125, + "learning_rate": 0.00016847076972014316, + "loss": 1.0096, + "step": 5820 + }, + { + "epoch": 0.33, + "grad_norm": 0.275390625, + "learning_rate": 0.00016839774691684395, + "loss": 0.9758, + "step": 5825 + }, + { + "epoch": 0.33, + "grad_norm": 0.2490234375, + "learning_rate": 0.00016832465551538465, + "loss": 0.9468, + "step": 5830 + }, + { + "epoch": 0.33, + "grad_norm": 0.25, + "learning_rate": 0.00016825149558907074, + "loss": 0.9649, + "step": 5835 + }, + { + "epoch": 0.34, + "grad_norm": 0.25390625, + "learning_rate": 0.0001681782672112766, + "loss": 0.9563, + "step": 5840 + }, + { + "epoch": 0.34, + "grad_norm": 0.271484375, + "learning_rate": 0.00016810497045544515, + "loss": 0.9937, + "step": 5845 + }, + { + "epoch": 0.34, + "grad_norm": 0.2734375, + "learning_rate": 0.0001680316053950879, + "loss": 0.9506, + "step": 5850 + }, + { + "epoch": 0.34, + "grad_norm": 0.255859375, + "learning_rate": 0.00016795817210378487, + "loss": 0.9616, + "step": 5855 + }, + { + "epoch": 0.34, + "grad_norm": 0.26171875, + "learning_rate": 0.0001678846706551846, + "loss": 0.9319, + "step": 5860 + }, + { + "epoch": 0.34, + "grad_norm": 0.2451171875, + "learning_rate": 0.00016781110112300377, + "loss": 0.9158, + "step": 5865 + }, + { + "epoch": 0.34, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001677374635810276, + "loss": 1.0142, + "step": 5870 + }, + { + "epoch": 0.34, + "grad_norm": 0.24609375, + "learning_rate": 0.00016766375810310934, + "loss": 0.9922, + "step": 5875 + }, + { + "epoch": 0.34, + "grad_norm": 0.25390625, + "learning_rate": 0.00016758998476317042, + "loss": 0.8845, + "step": 5880 + }, + { + "epoch": 0.34, + "grad_norm": 0.279296875, + "learning_rate": 0.0001675161436352004, + "loss": 0.9652, + "step": 5885 + }, + { + "epoch": 0.34, + "grad_norm": 0.248046875, + "learning_rate": 0.0001674422347932567, + "loss": 0.9629, + "step": 5890 + }, + { + "epoch": 0.34, + "grad_norm": 0.267578125, + "learning_rate": 0.00016736825831146482, + "loss": 0.9261, + "step": 5895 + }, + { + "epoch": 0.34, + "grad_norm": 0.26953125, + "learning_rate": 0.00016729421426401787, + "loss": 0.8938, + "step": 5900 + }, + { + "epoch": 0.34, + "grad_norm": 0.275390625, + "learning_rate": 0.000167220102725177, + "loss": 0.9534, + "step": 5905 + }, + { + "epoch": 0.34, + "grad_norm": 0.2578125, + "learning_rate": 0.0001671459237692708, + "loss": 0.8848, + "step": 5910 + }, + { + "epoch": 0.34, + "grad_norm": 0.263671875, + "learning_rate": 0.00016707167747069562, + "loss": 1.0258, + "step": 5915 + }, + { + "epoch": 0.34, + "grad_norm": 0.255859375, + "learning_rate": 0.0001669973639039153, + "loss": 0.9314, + "step": 5920 + }, + { + "epoch": 0.34, + "grad_norm": 0.2578125, + "learning_rate": 0.0001669229831434611, + "loss": 1.0198, + "step": 5925 + }, + { + "epoch": 0.34, + "grad_norm": 0.26171875, + "learning_rate": 0.00016684853526393185, + "loss": 1.0186, + "step": 5930 + }, + { + "epoch": 0.34, + "grad_norm": 0.259765625, + "learning_rate": 0.00016677402033999346, + "loss": 0.947, + "step": 5935 + }, + { + "epoch": 0.34, + "grad_norm": 0.2470703125, + "learning_rate": 0.00016669943844637924, + "loss": 0.9626, + "step": 5940 + }, + { + "epoch": 0.34, + "grad_norm": 0.279296875, + "learning_rate": 0.00016662478965788962, + "loss": 1.0421, + "step": 5945 + }, + { + "epoch": 0.34, + "grad_norm": 0.28515625, + "learning_rate": 0.00016655007404939212, + "loss": 0.8739, + "step": 5950 + }, + { + "epoch": 0.34, + "grad_norm": 0.419921875, + "learning_rate": 0.00016647529169582122, + "loss": 0.9454, + "step": 5955 + }, + { + "epoch": 0.34, + "grad_norm": 0.2431640625, + "learning_rate": 0.00016640044267217846, + "loss": 0.918, + "step": 5960 + }, + { + "epoch": 0.34, + "grad_norm": 0.267578125, + "learning_rate": 0.00016632552705353213, + "loss": 1.0082, + "step": 5965 + }, + { + "epoch": 0.34, + "grad_norm": 0.271484375, + "learning_rate": 0.00016625054491501738, + "loss": 0.9736, + "step": 5970 + }, + { + "epoch": 0.34, + "grad_norm": 0.271484375, + "learning_rate": 0.00016617549633183608, + "loss": 1.0407, + "step": 5975 + }, + { + "epoch": 0.34, + "grad_norm": 0.255859375, + "learning_rate": 0.00016610038137925668, + "loss": 0.9043, + "step": 5980 + }, + { + "epoch": 0.34, + "grad_norm": 0.287109375, + "learning_rate": 0.00016602520013261424, + "loss": 0.9397, + "step": 5985 + }, + { + "epoch": 0.34, + "grad_norm": 0.2578125, + "learning_rate": 0.0001659499526673103, + "loss": 0.9678, + "step": 5990 + }, + { + "epoch": 0.34, + "grad_norm": 0.25390625, + "learning_rate": 0.0001658746390588128, + "loss": 1.0472, + "step": 5995 + }, + { + "epoch": 0.34, + "grad_norm": 0.28125, + "learning_rate": 0.00016579925938265606, + "loss": 0.9778, + "step": 6000 + }, + { + "epoch": 0.34, + "grad_norm": 0.26953125, + "learning_rate": 0.00016572381371444058, + "loss": 0.9727, + "step": 6005 + }, + { + "epoch": 0.34, + "grad_norm": 0.265625, + "learning_rate": 0.0001656483021298331, + "loss": 1.0312, + "step": 6010 + }, + { + "epoch": 0.35, + "grad_norm": 0.255859375, + "learning_rate": 0.00016557272470456646, + "loss": 0.9426, + "step": 6015 + }, + { + "epoch": 0.35, + "grad_norm": 0.271484375, + "learning_rate": 0.00016549708151443956, + "loss": 0.8717, + "step": 6020 + }, + { + "epoch": 0.35, + "grad_norm": 0.28515625, + "learning_rate": 0.00016542137263531723, + "loss": 0.9256, + "step": 6025 + }, + { + "epoch": 0.35, + "grad_norm": 0.2734375, + "learning_rate": 0.00016534559814313017, + "loss": 0.9841, + "step": 6030 + }, + { + "epoch": 0.35, + "grad_norm": 0.2890625, + "learning_rate": 0.00016526975811387493, + "loss": 1.0084, + "step": 6035 + }, + { + "epoch": 0.35, + "grad_norm": 0.26953125, + "learning_rate": 0.00016519385262361372, + "loss": 1.003, + "step": 6040 + }, + { + "epoch": 0.35, + "grad_norm": 0.27734375, + "learning_rate": 0.00016511788174847444, + "loss": 0.9585, + "step": 6045 + }, + { + "epoch": 0.35, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001650418455646506, + "loss": 0.9436, + "step": 6050 + }, + { + "epoch": 0.35, + "grad_norm": 0.28515625, + "learning_rate": 0.00016496574414840117, + "loss": 0.9489, + "step": 6055 + }, + { + "epoch": 0.35, + "grad_norm": 0.259765625, + "learning_rate": 0.00016488957757605056, + "loss": 0.9298, + "step": 6060 + }, + { + "epoch": 0.35, + "grad_norm": 0.298828125, + "learning_rate": 0.0001648133459239885, + "loss": 1.0384, + "step": 6065 + }, + { + "epoch": 0.35, + "grad_norm": 0.25390625, + "learning_rate": 0.00016473704926867, + "loss": 0.9922, + "step": 6070 + }, + { + "epoch": 0.35, + "grad_norm": 0.2578125, + "learning_rate": 0.0001646606876866153, + "loss": 0.8736, + "step": 6075 + }, + { + "epoch": 0.35, + "grad_norm": 0.2734375, + "learning_rate": 0.00016458426125440974, + "loss": 0.9582, + "step": 6080 + }, + { + "epoch": 0.35, + "grad_norm": 0.259765625, + "learning_rate": 0.0001645077700487036, + "loss": 1.0029, + "step": 6085 + }, + { + "epoch": 0.35, + "grad_norm": 0.255859375, + "learning_rate": 0.00016443121414621236, + "loss": 0.9921, + "step": 6090 + }, + { + "epoch": 0.35, + "grad_norm": 0.25390625, + "learning_rate": 0.00016435459362371612, + "loss": 0.9999, + "step": 6095 + }, + { + "epoch": 0.35, + "grad_norm": 0.283203125, + "learning_rate": 0.00016427790855805995, + "loss": 0.9272, + "step": 6100 + }, + { + "epoch": 0.35, + "grad_norm": 0.251953125, + "learning_rate": 0.00016420115902615365, + "loss": 0.9221, + "step": 6105 + }, + { + "epoch": 0.35, + "grad_norm": 0.2734375, + "learning_rate": 0.00016412434510497157, + "loss": 1.0148, + "step": 6110 + }, + { + "epoch": 0.35, + "grad_norm": 0.259765625, + "learning_rate": 0.00016404746687155277, + "loss": 1.0202, + "step": 6115 + }, + { + "epoch": 0.35, + "grad_norm": 0.251953125, + "learning_rate": 0.00016397052440300067, + "loss": 0.9146, + "step": 6120 + }, + { + "epoch": 0.35, + "grad_norm": 0.263671875, + "learning_rate": 0.00016389351777648325, + "loss": 0.9159, + "step": 6125 + }, + { + "epoch": 0.35, + "grad_norm": 0.263671875, + "learning_rate": 0.00016381644706923277, + "loss": 0.9181, + "step": 6130 + }, + { + "epoch": 0.35, + "grad_norm": 0.251953125, + "learning_rate": 0.00016373931235854573, + "loss": 0.9705, + "step": 6135 + }, + { + "epoch": 0.35, + "grad_norm": 0.2578125, + "learning_rate": 0.00016366211372178285, + "loss": 0.9669, + "step": 6140 + }, + { + "epoch": 0.35, + "grad_norm": 0.267578125, + "learning_rate": 0.00016358485123636903, + "loss": 0.9533, + "step": 6145 + }, + { + "epoch": 0.35, + "grad_norm": 0.2578125, + "learning_rate": 0.00016350752497979308, + "loss": 0.9237, + "step": 6150 + }, + { + "epoch": 0.35, + "grad_norm": 0.251953125, + "learning_rate": 0.00016343013502960786, + "loss": 0.9883, + "step": 6155 + }, + { + "epoch": 0.35, + "grad_norm": 0.3046875, + "learning_rate": 0.00016335268146343008, + "loss": 0.9447, + "step": 6160 + }, + { + "epoch": 0.35, + "grad_norm": 0.255859375, + "learning_rate": 0.00016327516435894025, + "loss": 0.9629, + "step": 6165 + }, + { + "epoch": 0.35, + "grad_norm": 0.291015625, + "learning_rate": 0.0001631975837938826, + "loss": 1.0215, + "step": 6170 + }, + { + "epoch": 0.35, + "grad_norm": 0.271484375, + "learning_rate": 0.00016311993984606505, + "loss": 0.9506, + "step": 6175 + }, + { + "epoch": 0.35, + "grad_norm": 0.2890625, + "learning_rate": 0.00016304223259335898, + "loss": 1.0332, + "step": 6180 + }, + { + "epoch": 0.35, + "grad_norm": 0.2734375, + "learning_rate": 0.00016296446211369942, + "loss": 0.9143, + "step": 6185 + }, + { + "epoch": 0.36, + "grad_norm": 0.251953125, + "learning_rate": 0.00016288662848508467, + "loss": 0.9682, + "step": 6190 + }, + { + "epoch": 0.36, + "grad_norm": 0.26171875, + "learning_rate": 0.00016280873178557643, + "loss": 0.9127, + "step": 6195 + }, + { + "epoch": 0.36, + "grad_norm": 0.240234375, + "learning_rate": 0.00016273077209329968, + "loss": 0.8634, + "step": 6200 + }, + { + "epoch": 0.36, + "grad_norm": 0.26171875, + "learning_rate": 0.0001626527494864425, + "loss": 0.9811, + "step": 6205 + }, + { + "epoch": 0.36, + "grad_norm": 0.265625, + "learning_rate": 0.0001625746640432561, + "loss": 0.9667, + "step": 6210 + }, + { + "epoch": 0.36, + "grad_norm": 0.27734375, + "learning_rate": 0.0001624965158420548, + "loss": 0.9561, + "step": 6215 + }, + { + "epoch": 0.36, + "grad_norm": 0.26171875, + "learning_rate": 0.0001624183049612157, + "loss": 0.9321, + "step": 6220 + }, + { + "epoch": 0.36, + "grad_norm": 0.267578125, + "learning_rate": 0.00016234003147917888, + "loss": 0.9751, + "step": 6225 + }, + { + "epoch": 0.36, + "grad_norm": 0.271484375, + "learning_rate": 0.0001622616954744472, + "loss": 0.998, + "step": 6230 + }, + { + "epoch": 0.36, + "grad_norm": 0.2578125, + "learning_rate": 0.00016218329702558616, + "loss": 0.9544, + "step": 6235 + }, + { + "epoch": 0.36, + "grad_norm": 0.27734375, + "learning_rate": 0.00016210483621122395, + "loss": 0.9151, + "step": 6240 + }, + { + "epoch": 0.36, + "grad_norm": 0.26953125, + "learning_rate": 0.00016202631311005124, + "loss": 0.9362, + "step": 6245 + }, + { + "epoch": 0.36, + "grad_norm": 0.287109375, + "learning_rate": 0.00016194772780082125, + "loss": 0.976, + "step": 6250 + }, + { + "epoch": 0.36, + "grad_norm": 0.25, + "learning_rate": 0.0001618690803623496, + "loss": 0.988, + "step": 6255 + }, + { + "epoch": 0.36, + "grad_norm": 0.267578125, + "learning_rate": 0.00016179037087351406, + "loss": 0.9854, + "step": 6260 + }, + { + "epoch": 0.36, + "grad_norm": 0.271484375, + "learning_rate": 0.00016171159941325483, + "loss": 0.966, + "step": 6265 + }, + { + "epoch": 0.36, + "grad_norm": 0.26953125, + "learning_rate": 0.00016163276606057415, + "loss": 0.942, + "step": 6270 + }, + { + "epoch": 0.36, + "grad_norm": 0.30859375, + "learning_rate": 0.00016155387089453638, + "loss": 0.9386, + "step": 6275 + }, + { + "epoch": 0.36, + "grad_norm": 0.279296875, + "learning_rate": 0.00016147491399426785, + "loss": 0.9389, + "step": 6280 + }, + { + "epoch": 0.36, + "grad_norm": 0.25390625, + "learning_rate": 0.0001613958954389568, + "loss": 0.9415, + "step": 6285 + }, + { + "epoch": 0.36, + "grad_norm": 0.26953125, + "learning_rate": 0.00016131681530785335, + "loss": 0.9897, + "step": 6290 + }, + { + "epoch": 0.36, + "grad_norm": 0.2734375, + "learning_rate": 0.00016123767368026929, + "loss": 0.8812, + "step": 6295 + }, + { + "epoch": 0.36, + "grad_norm": 0.26171875, + "learning_rate": 0.0001611584706355782, + "loss": 0.9468, + "step": 6300 + }, + { + "epoch": 0.36, + "grad_norm": 0.265625, + "learning_rate": 0.0001610792062532152, + "loss": 0.9353, + "step": 6305 + }, + { + "epoch": 0.36, + "grad_norm": 0.25390625, + "learning_rate": 0.00016099988061267688, + "loss": 0.9126, + "step": 6310 + }, + { + "epoch": 0.36, + "grad_norm": 0.25390625, + "learning_rate": 0.00016092049379352132, + "loss": 0.9287, + "step": 6315 + }, + { + "epoch": 0.36, + "grad_norm": 0.2578125, + "learning_rate": 0.000160841045875368, + "loss": 0.9711, + "step": 6320 + }, + { + "epoch": 0.36, + "grad_norm": 0.2734375, + "learning_rate": 0.0001607615369378976, + "loss": 0.9874, + "step": 6325 + }, + { + "epoch": 0.36, + "grad_norm": 0.26953125, + "learning_rate": 0.00016068196706085197, + "loss": 0.9632, + "step": 6330 + }, + { + "epoch": 0.36, + "grad_norm": 0.25, + "learning_rate": 0.00016060233632403422, + "loss": 0.8828, + "step": 6335 + }, + { + "epoch": 0.36, + "grad_norm": 0.279296875, + "learning_rate": 0.0001605226448073084, + "loss": 0.9771, + "step": 6340 + }, + { + "epoch": 0.36, + "grad_norm": 0.294921875, + "learning_rate": 0.0001604428925905995, + "loss": 0.9693, + "step": 6345 + }, + { + "epoch": 0.36, + "grad_norm": 0.26171875, + "learning_rate": 0.00016036307975389344, + "loss": 0.8772, + "step": 6350 + }, + { + "epoch": 0.36, + "grad_norm": 0.255859375, + "learning_rate": 0.00016028320637723694, + "loss": 0.8691, + "step": 6355 + }, + { + "epoch": 0.36, + "grad_norm": 0.375, + "learning_rate": 0.00016020327254073736, + "loss": 0.9865, + "step": 6360 + }, + { + "epoch": 0.37, + "grad_norm": 0.28125, + "learning_rate": 0.0001601232783245628, + "loss": 0.9231, + "step": 6365 + }, + { + "epoch": 0.37, + "grad_norm": 0.2578125, + "learning_rate": 0.00016004322380894182, + "loss": 0.9726, + "step": 6370 + }, + { + "epoch": 0.37, + "grad_norm": 0.27734375, + "learning_rate": 0.00015996310907416355, + "loss": 0.9754, + "step": 6375 + }, + { + "epoch": 0.37, + "grad_norm": 0.255859375, + "learning_rate": 0.00015988293420057744, + "loss": 1.03, + "step": 6380 + }, + { + "epoch": 0.37, + "grad_norm": 0.267578125, + "learning_rate": 0.0001598026992685933, + "loss": 0.9943, + "step": 6385 + }, + { + "epoch": 0.37, + "grad_norm": 0.26171875, + "learning_rate": 0.00015972240435868117, + "loss": 0.9568, + "step": 6390 + }, + { + "epoch": 0.37, + "grad_norm": 0.279296875, + "learning_rate": 0.00015964204955137124, + "loss": 0.9483, + "step": 6395 + }, + { + "epoch": 0.37, + "grad_norm": 0.255859375, + "learning_rate": 0.00015956163492725372, + "loss": 0.9173, + "step": 6400 + }, + { + "epoch": 0.37, + "grad_norm": 0.2578125, + "learning_rate": 0.00015948116056697888, + "loss": 0.9607, + "step": 6405 + }, + { + "epoch": 0.37, + "grad_norm": 0.251953125, + "learning_rate": 0.0001594006265512569, + "loss": 0.9838, + "step": 6410 + }, + { + "epoch": 0.37, + "grad_norm": 0.271484375, + "learning_rate": 0.00015932003296085774, + "loss": 0.9236, + "step": 6415 + }, + { + "epoch": 0.37, + "grad_norm": 0.2734375, + "learning_rate": 0.00015923937987661116, + "loss": 0.9918, + "step": 6420 + }, + { + "epoch": 0.37, + "grad_norm": 0.259765625, + "learning_rate": 0.00015915866737940655, + "loss": 0.9185, + "step": 6425 + }, + { + "epoch": 0.37, + "grad_norm": 0.265625, + "learning_rate": 0.00015907789555019296, + "loss": 0.9472, + "step": 6430 + }, + { + "epoch": 0.37, + "grad_norm": 0.265625, + "learning_rate": 0.0001589970644699788, + "loss": 0.9831, + "step": 6435 + }, + { + "epoch": 0.37, + "grad_norm": 0.255859375, + "learning_rate": 0.00015891617421983205, + "loss": 0.8943, + "step": 6440 + }, + { + "epoch": 0.37, + "grad_norm": 0.265625, + "learning_rate": 0.00015883522488087994, + "loss": 0.9508, + "step": 6445 + }, + { + "epoch": 0.37, + "grad_norm": 0.259765625, + "learning_rate": 0.00015875421653430903, + "loss": 0.9309, + "step": 6450 + }, + { + "epoch": 0.37, + "grad_norm": 0.259765625, + "learning_rate": 0.00015867314926136509, + "loss": 0.9801, + "step": 6455 + }, + { + "epoch": 0.37, + "grad_norm": 0.267578125, + "learning_rate": 0.0001585920231433528, + "loss": 0.9322, + "step": 6460 + }, + { + "epoch": 0.37, + "grad_norm": 0.259765625, + "learning_rate": 0.00015851083826163607, + "loss": 0.9407, + "step": 6465 + }, + { + "epoch": 0.37, + "grad_norm": 0.271484375, + "learning_rate": 0.00015842959469763765, + "loss": 0.9791, + "step": 6470 + }, + { + "epoch": 0.37, + "grad_norm": 0.26171875, + "learning_rate": 0.00015834829253283915, + "loss": 0.9557, + "step": 6475 + }, + { + "epoch": 0.37, + "grad_norm": 0.23828125, + "learning_rate": 0.00015826693184878095, + "loss": 0.9298, + "step": 6480 + }, + { + "epoch": 0.37, + "grad_norm": 0.255859375, + "learning_rate": 0.00015818551272706217, + "loss": 0.9324, + "step": 6485 + }, + { + "epoch": 0.37, + "grad_norm": 0.287109375, + "learning_rate": 0.00015810403524934042, + "loss": 0.9836, + "step": 6490 + }, + { + "epoch": 0.37, + "grad_norm": 0.265625, + "learning_rate": 0.00015802249949733202, + "loss": 1.0264, + "step": 6495 + }, + { + "epoch": 0.37, + "grad_norm": 0.265625, + "learning_rate": 0.00015794090555281155, + "loss": 0.9183, + "step": 6500 + }, + { + "epoch": 0.37, + "grad_norm": 0.26953125, + "learning_rate": 0.00015785925349761208, + "loss": 0.9037, + "step": 6505 + }, + { + "epoch": 0.37, + "grad_norm": 0.2490234375, + "learning_rate": 0.00015777754341362487, + "loss": 0.9699, + "step": 6510 + }, + { + "epoch": 0.37, + "grad_norm": 0.259765625, + "learning_rate": 0.00015769577538279949, + "loss": 0.9243, + "step": 6515 + }, + { + "epoch": 0.37, + "grad_norm": 0.25390625, + "learning_rate": 0.0001576139494871435, + "loss": 0.9242, + "step": 6520 + }, + { + "epoch": 0.37, + "grad_norm": 0.26171875, + "learning_rate": 0.00015753206580872256, + "loss": 1.0183, + "step": 6525 + }, + { + "epoch": 0.37, + "grad_norm": 0.263671875, + "learning_rate": 0.0001574501244296603, + "loss": 0.9684, + "step": 6530 + }, + { + "epoch": 0.37, + "grad_norm": 0.248046875, + "learning_rate": 0.0001573681254321382, + "loss": 0.9501, + "step": 6535 + }, + { + "epoch": 0.38, + "grad_norm": 0.25, + "learning_rate": 0.00015728606889839553, + "loss": 0.901, + "step": 6540 + }, + { + "epoch": 0.38, + "grad_norm": 0.28125, + "learning_rate": 0.00015720395491072918, + "loss": 0.9925, + "step": 6545 + }, + { + "epoch": 0.38, + "grad_norm": 0.248046875, + "learning_rate": 0.00015712178355149385, + "loss": 0.9256, + "step": 6550 + }, + { + "epoch": 0.38, + "grad_norm": 0.263671875, + "learning_rate": 0.00015703955490310162, + "loss": 0.9016, + "step": 6555 + }, + { + "epoch": 0.38, + "grad_norm": 0.263671875, + "learning_rate": 0.00015695726904802208, + "loss": 0.9426, + "step": 6560 + }, + { + "epoch": 0.38, + "grad_norm": 0.271484375, + "learning_rate": 0.0001568749260687822, + "loss": 1.0187, + "step": 6565 + }, + { + "epoch": 0.38, + "grad_norm": 0.283203125, + "learning_rate": 0.00015679252604796623, + "loss": 0.9141, + "step": 6570 + }, + { + "epoch": 0.38, + "grad_norm": 0.25, + "learning_rate": 0.0001567100690682156, + "loss": 0.9538, + "step": 6575 + }, + { + "epoch": 0.38, + "grad_norm": 0.25390625, + "learning_rate": 0.00015662755521222895, + "loss": 0.9471, + "step": 6580 + }, + { + "epoch": 0.38, + "grad_norm": 0.26171875, + "learning_rate": 0.00015654498456276188, + "loss": 0.9182, + "step": 6585 + }, + { + "epoch": 0.38, + "grad_norm": 0.298828125, + "learning_rate": 0.000156462357202627, + "loss": 1.0089, + "step": 6590 + }, + { + "epoch": 0.38, + "grad_norm": 0.251953125, + "learning_rate": 0.0001563796732146938, + "loss": 0.9403, + "step": 6595 + }, + { + "epoch": 0.38, + "grad_norm": 0.275390625, + "learning_rate": 0.00015629693268188842, + "loss": 1.0023, + "step": 6600 + }, + { + "epoch": 0.38, + "grad_norm": 0.251953125, + "learning_rate": 0.000156214135687194, + "loss": 1.0552, + "step": 6605 + }, + { + "epoch": 0.38, + "grad_norm": 0.271484375, + "learning_rate": 0.00015613128231365002, + "loss": 0.9476, + "step": 6610 + }, + { + "epoch": 0.38, + "grad_norm": 0.2412109375, + "learning_rate": 0.00015604837264435268, + "loss": 0.9862, + "step": 6615 + }, + { + "epoch": 0.38, + "grad_norm": 0.263671875, + "learning_rate": 0.00015596540676245454, + "loss": 1.0152, + "step": 6620 + }, + { + "epoch": 0.38, + "grad_norm": 0.263671875, + "learning_rate": 0.00015588238475116464, + "loss": 0.9548, + "step": 6625 + }, + { + "epoch": 0.38, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001557993066937482, + "loss": 0.9643, + "step": 6630 + }, + { + "epoch": 0.38, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001557161726735268, + "loss": 0.9176, + "step": 6635 + }, + { + "epoch": 0.38, + "grad_norm": 0.279296875, + "learning_rate": 0.0001556329827738779, + "loss": 0.9208, + "step": 6640 + }, + { + "epoch": 0.38, + "grad_norm": 0.26171875, + "learning_rate": 0.00015554973707823525, + "loss": 0.9301, + "step": 6645 + }, + { + "epoch": 0.38, + "grad_norm": 0.251953125, + "learning_rate": 0.00015546643567008848, + "loss": 0.9482, + "step": 6650 + }, + { + "epoch": 0.38, + "grad_norm": 0.24609375, + "learning_rate": 0.00015538307863298303, + "loss": 0.9623, + "step": 6655 + }, + { + "epoch": 0.38, + "grad_norm": 0.271484375, + "learning_rate": 0.00015529966605052023, + "loss": 0.9909, + "step": 6660 + }, + { + "epoch": 0.38, + "grad_norm": 0.263671875, + "learning_rate": 0.00015521619800635704, + "loss": 1.0055, + "step": 6665 + }, + { + "epoch": 0.38, + "grad_norm": 0.275390625, + "learning_rate": 0.00015513267458420606, + "loss": 0.9519, + "step": 6670 + }, + { + "epoch": 0.38, + "grad_norm": 0.26953125, + "learning_rate": 0.0001550490958678355, + "loss": 0.994, + "step": 6675 + }, + { + "epoch": 0.38, + "grad_norm": 0.279296875, + "learning_rate": 0.00015496546194106888, + "loss": 0.9019, + "step": 6680 + }, + { + "epoch": 0.38, + "grad_norm": 0.263671875, + "learning_rate": 0.00015488177288778532, + "loss": 0.9355, + "step": 6685 + }, + { + "epoch": 0.38, + "grad_norm": 0.271484375, + "learning_rate": 0.00015479802879191898, + "loss": 0.9583, + "step": 6690 + }, + { + "epoch": 0.38, + "grad_norm": 0.265625, + "learning_rate": 0.00015471422973745936, + "loss": 0.9922, + "step": 6695 + }, + { + "epoch": 0.38, + "grad_norm": 0.30078125, + "learning_rate": 0.00015463037580845107, + "loss": 0.9591, + "step": 6700 + }, + { + "epoch": 0.38, + "grad_norm": 0.26171875, + "learning_rate": 0.00015454646708899374, + "loss": 0.9839, + "step": 6705 + }, + { + "epoch": 0.39, + "grad_norm": 0.2451171875, + "learning_rate": 0.00015446250366324196, + "loss": 0.9443, + "step": 6710 + }, + { + "epoch": 0.39, + "grad_norm": 0.255859375, + "learning_rate": 0.00015437848561540517, + "loss": 0.961, + "step": 6715 + }, + { + "epoch": 0.39, + "grad_norm": 0.287109375, + "learning_rate": 0.00015429441302974755, + "loss": 0.9097, + "step": 6720 + }, + { + "epoch": 0.39, + "grad_norm": 0.2734375, + "learning_rate": 0.00015421028599058812, + "loss": 0.8755, + "step": 6725 + }, + { + "epoch": 0.39, + "grad_norm": 0.267578125, + "learning_rate": 0.00015412610458230037, + "loss": 0.9329, + "step": 6730 + }, + { + "epoch": 0.39, + "grad_norm": 0.2578125, + "learning_rate": 0.00015404186888931233, + "loss": 0.9379, + "step": 6735 + }, + { + "epoch": 0.39, + "grad_norm": 0.26953125, + "learning_rate": 0.00015395757899610662, + "loss": 1.0126, + "step": 6740 + }, + { + "epoch": 0.39, + "grad_norm": 0.2373046875, + "learning_rate": 0.00015387323498722, + "loss": 0.9752, + "step": 6745 + }, + { + "epoch": 0.39, + "grad_norm": 0.28125, + "learning_rate": 0.00015378883694724369, + "loss": 0.9499, + "step": 6750 + }, + { + "epoch": 0.39, + "grad_norm": 0.26953125, + "learning_rate": 0.00015370438496082302, + "loss": 0.9756, + "step": 6755 + }, + { + "epoch": 0.39, + "grad_norm": 0.259765625, + "learning_rate": 0.0001536198791126574, + "loss": 0.932, + "step": 6760 + }, + { + "epoch": 0.39, + "grad_norm": 0.2578125, + "learning_rate": 0.00015353531948750026, + "loss": 0.8913, + "step": 6765 + }, + { + "epoch": 0.39, + "grad_norm": 0.25, + "learning_rate": 0.0001534507061701591, + "loss": 0.8923, + "step": 6770 + }, + { + "epoch": 0.39, + "grad_norm": 0.26953125, + "learning_rate": 0.00015336603924549503, + "loss": 1.0337, + "step": 6775 + }, + { + "epoch": 0.39, + "grad_norm": 0.25, + "learning_rate": 0.0001532813187984232, + "loss": 0.9402, + "step": 6780 + }, + { + "epoch": 0.39, + "grad_norm": 0.263671875, + "learning_rate": 0.0001531965449139122, + "loss": 0.9171, + "step": 6785 + }, + { + "epoch": 0.39, + "grad_norm": 0.275390625, + "learning_rate": 0.00015311171767698435, + "loss": 0.9178, + "step": 6790 + }, + { + "epoch": 0.39, + "grad_norm": 0.259765625, + "learning_rate": 0.0001530268371727154, + "loss": 0.9138, + "step": 6795 + }, + { + "epoch": 0.39, + "grad_norm": 0.29296875, + "learning_rate": 0.0001529419034862346, + "loss": 0.992, + "step": 6800 + }, + { + "epoch": 0.39, + "grad_norm": 0.255859375, + "learning_rate": 0.00015285691670272451, + "loss": 0.9323, + "step": 6805 + }, + { + "epoch": 0.39, + "grad_norm": 0.28125, + "learning_rate": 0.0001527718769074209, + "loss": 0.9513, + "step": 6810 + }, + { + "epoch": 0.39, + "grad_norm": 0.27734375, + "learning_rate": 0.00015268678418561276, + "loss": 0.9996, + "step": 6815 + }, + { + "epoch": 0.39, + "grad_norm": 0.296875, + "learning_rate": 0.00015260163862264217, + "loss": 0.9446, + "step": 6820 + }, + { + "epoch": 0.39, + "grad_norm": 0.26171875, + "learning_rate": 0.00015251644030390415, + "loss": 0.9033, + "step": 6825 + }, + { + "epoch": 0.39, + "grad_norm": 0.267578125, + "learning_rate": 0.00015243118931484667, + "loss": 1.0265, + "step": 6830 + }, + { + "epoch": 0.39, + "grad_norm": 0.24609375, + "learning_rate": 0.0001523458857409705, + "loss": 0.9354, + "step": 6835 + }, + { + "epoch": 0.39, + "grad_norm": 0.265625, + "learning_rate": 0.00015226052966782914, + "loss": 0.9134, + "step": 6840 + }, + { + "epoch": 0.39, + "grad_norm": 0.228515625, + "learning_rate": 0.0001521751211810288, + "loss": 0.9379, + "step": 6845 + }, + { + "epoch": 0.39, + "grad_norm": 0.259765625, + "learning_rate": 0.00015208966036622825, + "loss": 0.9665, + "step": 6850 + }, + { + "epoch": 0.39, + "grad_norm": 0.26171875, + "learning_rate": 0.00015200414730913865, + "loss": 1.0182, + "step": 6855 + }, + { + "epoch": 0.39, + "grad_norm": 0.267578125, + "learning_rate": 0.00015191858209552368, + "loss": 1.046, + "step": 6860 + }, + { + "epoch": 0.39, + "grad_norm": 0.26953125, + "learning_rate": 0.00015183296481119924, + "loss": 1.0084, + "step": 6865 + }, + { + "epoch": 0.39, + "grad_norm": 0.298828125, + "learning_rate": 0.00015174729554203348, + "loss": 0.9653, + "step": 6870 + }, + { + "epoch": 0.39, + "grad_norm": 0.2578125, + "learning_rate": 0.00015166157437394672, + "loss": 0.9834, + "step": 6875 + }, + { + "epoch": 0.39, + "grad_norm": 0.28125, + "learning_rate": 0.00015157580139291124, + "loss": 1.0267, + "step": 6880 + }, + { + "epoch": 0.4, + "grad_norm": 0.26953125, + "learning_rate": 0.00015148997668495143, + "loss": 0.931, + "step": 6885 + }, + { + "epoch": 0.4, + "grad_norm": 0.251953125, + "learning_rate": 0.0001514041003361434, + "loss": 0.966, + "step": 6890 + }, + { + "epoch": 0.4, + "grad_norm": 0.259765625, + "learning_rate": 0.00015131817243261512, + "loss": 0.8916, + "step": 6895 + }, + { + "epoch": 0.4, + "grad_norm": 0.251953125, + "learning_rate": 0.00015123219306054634, + "loss": 0.909, + "step": 6900 + }, + { + "epoch": 0.4, + "grad_norm": 0.26953125, + "learning_rate": 0.00015114616230616835, + "loss": 1.0168, + "step": 6905 + }, + { + "epoch": 0.4, + "grad_norm": 0.255859375, + "learning_rate": 0.00015106008025576393, + "loss": 0.8856, + "step": 6910 + }, + { + "epoch": 0.4, + "grad_norm": 0.275390625, + "learning_rate": 0.00015097394699566737, + "loss": 0.9172, + "step": 6915 + }, + { + "epoch": 0.4, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001508877626122644, + "loss": 0.9176, + "step": 6920 + }, + { + "epoch": 0.4, + "grad_norm": 0.2578125, + "learning_rate": 0.00015080152719199183, + "loss": 0.9964, + "step": 6925 + }, + { + "epoch": 0.4, + "grad_norm": 0.26171875, + "learning_rate": 0.00015071524082133778, + "loss": 0.8834, + "step": 6930 + }, + { + "epoch": 0.4, + "grad_norm": 0.28125, + "learning_rate": 0.00015062890358684148, + "loss": 1.0168, + "step": 6935 + }, + { + "epoch": 0.4, + "grad_norm": 0.267578125, + "learning_rate": 0.0001505425155750931, + "loss": 1.0014, + "step": 6940 + }, + { + "epoch": 0.4, + "grad_norm": 0.271484375, + "learning_rate": 0.00015045607687273383, + "loss": 0.9284, + "step": 6945 + }, + { + "epoch": 0.4, + "grad_norm": 0.248046875, + "learning_rate": 0.00015036958756645564, + "loss": 1.0146, + "step": 6950 + }, + { + "epoch": 0.4, + "grad_norm": 0.259765625, + "learning_rate": 0.00015028304774300123, + "loss": 0.8889, + "step": 6955 + }, + { + "epoch": 0.4, + "grad_norm": 0.27734375, + "learning_rate": 0.00015019645748916402, + "loss": 0.9944, + "step": 6960 + }, + { + "epoch": 0.4, + "grad_norm": 0.2470703125, + "learning_rate": 0.00015010981689178796, + "loss": 0.9689, + "step": 6965 + }, + { + "epoch": 0.4, + "grad_norm": 0.29296875, + "learning_rate": 0.00015002312603776754, + "loss": 0.8948, + "step": 6970 + }, + { + "epoch": 0.4, + "grad_norm": 0.263671875, + "learning_rate": 0.00014993638501404762, + "loss": 0.9505, + "step": 6975 + }, + { + "epoch": 0.4, + "grad_norm": 0.25390625, + "learning_rate": 0.00014984959390762335, + "loss": 1.0177, + "step": 6980 + }, + { + "epoch": 0.4, + "grad_norm": 0.2578125, + "learning_rate": 0.00014976275280554016, + "loss": 0.9758, + "step": 6985 + }, + { + "epoch": 0.4, + "grad_norm": 0.25390625, + "learning_rate": 0.00014967586179489366, + "loss": 1.0012, + "step": 6990 + }, + { + "epoch": 0.4, + "grad_norm": 0.267578125, + "learning_rate": 0.0001495889209628294, + "loss": 0.944, + "step": 6995 + }, + { + "epoch": 0.4, + "grad_norm": 0.271484375, + "learning_rate": 0.00014950193039654297, + "loss": 0.9734, + "step": 7000 + }, + { + "epoch": 0.4, + "grad_norm": 0.275390625, + "learning_rate": 0.00014941489018327988, + "loss": 0.9297, + "step": 7005 + }, + { + "epoch": 0.4, + "grad_norm": 0.279296875, + "learning_rate": 0.0001493278004103353, + "loss": 0.9897, + "step": 7010 + }, + { + "epoch": 0.4, + "grad_norm": 0.25390625, + "learning_rate": 0.00014924066116505427, + "loss": 0.9069, + "step": 7015 + }, + { + "epoch": 0.4, + "grad_norm": 0.251953125, + "learning_rate": 0.00014915347253483126, + "loss": 1.0033, + "step": 7020 + }, + { + "epoch": 0.4, + "grad_norm": 0.294921875, + "learning_rate": 0.00014906623460711046, + "loss": 0.9473, + "step": 7025 + }, + { + "epoch": 0.4, + "grad_norm": 0.244140625, + "learning_rate": 0.00014897894746938536, + "loss": 0.8782, + "step": 7030 + }, + { + "epoch": 0.4, + "grad_norm": 0.27734375, + "learning_rate": 0.00014889161120919893, + "loss": 0.9315, + "step": 7035 + }, + { + "epoch": 0.4, + "grad_norm": 0.263671875, + "learning_rate": 0.00014880422591414323, + "loss": 0.9325, + "step": 7040 + }, + { + "epoch": 0.4, + "grad_norm": 0.25390625, + "learning_rate": 0.00014871679167185973, + "loss": 0.9548, + "step": 7045 + }, + { + "epoch": 0.4, + "grad_norm": 0.25390625, + "learning_rate": 0.00014862930857003877, + "loss": 0.9158, + "step": 7050 + }, + { + "epoch": 0.4, + "grad_norm": 0.2734375, + "learning_rate": 0.00014854177669641983, + "loss": 0.9412, + "step": 7055 + }, + { + "epoch": 0.41, + "grad_norm": 0.2578125, + "learning_rate": 0.0001484541961387912, + "loss": 0.998, + "step": 7060 + }, + { + "epoch": 0.41, + "grad_norm": 0.263671875, + "learning_rate": 0.00014836656698499016, + "loss": 0.9125, + "step": 7065 + }, + { + "epoch": 0.41, + "grad_norm": 0.259765625, + "learning_rate": 0.00014827888932290257, + "loss": 0.9262, + "step": 7070 + }, + { + "epoch": 0.41, + "grad_norm": 0.26171875, + "learning_rate": 0.000148191163240463, + "loss": 0.9809, + "step": 7075 + }, + { + "epoch": 0.41, + "grad_norm": 0.26171875, + "learning_rate": 0.00014810338882565455, + "loss": 0.9246, + "step": 7080 + }, + { + "epoch": 0.41, + "grad_norm": 0.251953125, + "learning_rate": 0.00014801556616650887, + "loss": 0.9429, + "step": 7085 + }, + { + "epoch": 0.41, + "grad_norm": 0.255859375, + "learning_rate": 0.00014792769535110597, + "loss": 0.895, + "step": 7090 + }, + { + "epoch": 0.41, + "grad_norm": 0.283203125, + "learning_rate": 0.00014783977646757403, + "loss": 0.9291, + "step": 7095 + }, + { + "epoch": 0.41, + "grad_norm": 0.2470703125, + "learning_rate": 0.00014775180960408966, + "loss": 0.9427, + "step": 7100 + }, + { + "epoch": 0.41, + "grad_norm": 0.28125, + "learning_rate": 0.00014766379484887744, + "loss": 1.0229, + "step": 7105 + }, + { + "epoch": 0.41, + "grad_norm": 0.2578125, + "learning_rate": 0.00014757573229021002, + "loss": 1.005, + "step": 7110 + }, + { + "epoch": 0.41, + "grad_norm": 0.259765625, + "learning_rate": 0.00014748762201640796, + "loss": 0.8976, + "step": 7115 + }, + { + "epoch": 0.41, + "grad_norm": 0.267578125, + "learning_rate": 0.00014739946411583977, + "loss": 0.9826, + "step": 7120 + }, + { + "epoch": 0.41, + "grad_norm": 0.265625, + "learning_rate": 0.00014731125867692158, + "loss": 0.9575, + "step": 7125 + }, + { + "epoch": 0.41, + "grad_norm": 0.287109375, + "learning_rate": 0.00014722300578811734, + "loss": 0.9073, + "step": 7130 + }, + { + "epoch": 0.41, + "grad_norm": 0.259765625, + "learning_rate": 0.00014713470553793853, + "loss": 0.9769, + "step": 7135 + }, + { + "epoch": 0.41, + "grad_norm": 0.271484375, + "learning_rate": 0.0001470463580149441, + "loss": 0.9957, + "step": 7140 + }, + { + "epoch": 0.41, + "grad_norm": 0.2451171875, + "learning_rate": 0.00014695796330774048, + "loss": 0.9545, + "step": 7145 + }, + { + "epoch": 0.41, + "grad_norm": 0.263671875, + "learning_rate": 0.00014686952150498134, + "loss": 0.9559, + "step": 7150 + }, + { + "epoch": 0.41, + "grad_norm": 0.2734375, + "learning_rate": 0.00014678103269536762, + "loss": 0.9678, + "step": 7155 + }, + { + "epoch": 0.41, + "grad_norm": 0.2412109375, + "learning_rate": 0.00014669249696764748, + "loss": 0.9252, + "step": 7160 + }, + { + "epoch": 0.41, + "grad_norm": 0.267578125, + "learning_rate": 0.00014660391441061603, + "loss": 0.9327, + "step": 7165 + }, + { + "epoch": 0.41, + "grad_norm": 0.27734375, + "learning_rate": 0.00014651528511311538, + "loss": 0.9919, + "step": 7170 + }, + { + "epoch": 0.41, + "grad_norm": 0.263671875, + "learning_rate": 0.0001464266091640345, + "loss": 0.9298, + "step": 7175 + }, + { + "epoch": 0.41, + "grad_norm": 0.2578125, + "learning_rate": 0.0001463378866523092, + "loss": 0.9694, + "step": 7180 + }, + { + "epoch": 0.41, + "grad_norm": 0.287109375, + "learning_rate": 0.00014624911766692196, + "loss": 0.9764, + "step": 7185 + }, + { + "epoch": 0.41, + "grad_norm": 0.2490234375, + "learning_rate": 0.00014616030229690186, + "loss": 0.9309, + "step": 7190 + }, + { + "epoch": 0.41, + "grad_norm": 0.28515625, + "learning_rate": 0.0001460714406313245, + "loss": 0.917, + "step": 7195 + }, + { + "epoch": 0.41, + "grad_norm": 0.263671875, + "learning_rate": 0.00014598253275931197, + "loss": 0.9378, + "step": 7200 + }, + { + "epoch": 0.41, + "grad_norm": 0.296875, + "learning_rate": 0.00014589357877003257, + "loss": 0.9691, + "step": 7205 + }, + { + "epoch": 0.41, + "grad_norm": 0.2578125, + "learning_rate": 0.00014580457875270098, + "loss": 0.9381, + "step": 7210 + }, + { + "epoch": 0.41, + "grad_norm": 0.27734375, + "learning_rate": 0.00014571553279657803, + "loss": 0.9212, + "step": 7215 + }, + { + "epoch": 0.41, + "grad_norm": 0.2431640625, + "learning_rate": 0.00014562644099097048, + "loss": 1.0157, + "step": 7220 + }, + { + "epoch": 0.41, + "grad_norm": 0.271484375, + "learning_rate": 0.00014553730342523134, + "loss": 0.9484, + "step": 7225 + }, + { + "epoch": 0.41, + "grad_norm": 0.248046875, + "learning_rate": 0.0001454481201887592, + "loss": 0.9287, + "step": 7230 + }, + { + "epoch": 0.42, + "grad_norm": 0.271484375, + "learning_rate": 0.00014535889137099877, + "loss": 0.9912, + "step": 7235 + }, + { + "epoch": 0.42, + "grad_norm": 0.251953125, + "learning_rate": 0.00014526961706144023, + "loss": 0.9336, + "step": 7240 + }, + { + "epoch": 0.42, + "grad_norm": 0.2578125, + "learning_rate": 0.00014518029734961947, + "loss": 0.9718, + "step": 7245 + }, + { + "epoch": 0.42, + "grad_norm": 0.267578125, + "learning_rate": 0.00014509093232511791, + "loss": 0.9316, + "step": 7250 + }, + { + "epoch": 0.42, + "grad_norm": 0.248046875, + "learning_rate": 0.00014500152207756246, + "loss": 0.9038, + "step": 7255 + }, + { + "epoch": 0.42, + "grad_norm": 0.2734375, + "learning_rate": 0.00014491206669662533, + "loss": 1.002, + "step": 7260 + }, + { + "epoch": 0.42, + "grad_norm": 0.244140625, + "learning_rate": 0.00014482256627202405, + "loss": 0.9443, + "step": 7265 + }, + { + "epoch": 0.42, + "grad_norm": 0.263671875, + "learning_rate": 0.00014473302089352123, + "loss": 0.8925, + "step": 7270 + }, + { + "epoch": 0.42, + "grad_norm": 0.255859375, + "learning_rate": 0.00014464343065092466, + "loss": 0.9866, + "step": 7275 + }, + { + "epoch": 0.42, + "grad_norm": 0.25390625, + "learning_rate": 0.00014455379563408713, + "loss": 0.9338, + "step": 7280 + }, + { + "epoch": 0.42, + "grad_norm": 0.25, + "learning_rate": 0.00014446411593290625, + "loss": 0.879, + "step": 7285 + }, + { + "epoch": 0.42, + "grad_norm": 0.263671875, + "learning_rate": 0.0001443743916373245, + "loss": 0.9915, + "step": 7290 + }, + { + "epoch": 0.42, + "grad_norm": 0.267578125, + "learning_rate": 0.00014428462283732908, + "loss": 0.9512, + "step": 7295 + }, + { + "epoch": 0.42, + "grad_norm": 0.265625, + "learning_rate": 0.0001441948096229518, + "loss": 0.9097, + "step": 7300 + }, + { + "epoch": 0.42, + "grad_norm": 0.255859375, + "learning_rate": 0.00014410495208426908, + "loss": 0.9026, + "step": 7305 + }, + { + "epoch": 0.42, + "grad_norm": 0.259765625, + "learning_rate": 0.00014401505031140171, + "loss": 0.9441, + "step": 7310 + }, + { + "epoch": 0.42, + "grad_norm": 0.2490234375, + "learning_rate": 0.00014392510439451494, + "loss": 0.9621, + "step": 7315 + }, + { + "epoch": 0.42, + "grad_norm": 0.255859375, + "learning_rate": 0.00014383511442381822, + "loss": 0.977, + "step": 7320 + }, + { + "epoch": 0.42, + "grad_norm": 0.275390625, + "learning_rate": 0.00014374508048956515, + "loss": 0.9762, + "step": 7325 + }, + { + "epoch": 0.42, + "grad_norm": 0.26953125, + "learning_rate": 0.00014365500268205352, + "loss": 0.971, + "step": 7330 + }, + { + "epoch": 0.42, + "grad_norm": 0.265625, + "learning_rate": 0.0001435648810916251, + "loss": 0.9624, + "step": 7335 + }, + { + "epoch": 0.42, + "grad_norm": 0.3046875, + "learning_rate": 0.0001434747158086655, + "loss": 0.9771, + "step": 7340 + }, + { + "epoch": 0.42, + "grad_norm": 0.2578125, + "learning_rate": 0.00014338450692360418, + "loss": 0.96, + "step": 7345 + }, + { + "epoch": 0.42, + "grad_norm": 0.26953125, + "learning_rate": 0.0001432942545269144, + "loss": 0.9897, + "step": 7350 + }, + { + "epoch": 0.42, + "grad_norm": 0.265625, + "learning_rate": 0.000143203958709113, + "loss": 0.9156, + "step": 7355 + }, + { + "epoch": 0.42, + "grad_norm": 0.259765625, + "learning_rate": 0.00014311361956076036, + "loss": 0.9731, + "step": 7360 + }, + { + "epoch": 0.42, + "grad_norm": 0.265625, + "learning_rate": 0.00014302323717246032, + "loss": 0.9721, + "step": 7365 + }, + { + "epoch": 0.42, + "grad_norm": 0.2734375, + "learning_rate": 0.0001429328116348601, + "loss": 0.9479, + "step": 7370 + }, + { + "epoch": 0.42, + "grad_norm": 0.26953125, + "learning_rate": 0.00014284234303865026, + "loss": 0.9593, + "step": 7375 + }, + { + "epoch": 0.42, + "grad_norm": 0.271484375, + "learning_rate": 0.0001427518314745644, + "loss": 0.9217, + "step": 7380 + }, + { + "epoch": 0.42, + "grad_norm": 0.267578125, + "learning_rate": 0.0001426612770333793, + "loss": 0.9589, + "step": 7385 + }, + { + "epoch": 0.42, + "grad_norm": 0.26953125, + "learning_rate": 0.00014257067980591475, + "loss": 0.982, + "step": 7390 + }, + { + "epoch": 0.42, + "grad_norm": 0.267578125, + "learning_rate": 0.00014248003988303346, + "loss": 0.9983, + "step": 7395 + }, + { + "epoch": 0.42, + "grad_norm": 0.255859375, + "learning_rate": 0.00014238935735564094, + "loss": 1.0515, + "step": 7400 + }, + { + "epoch": 0.42, + "grad_norm": 0.2578125, + "learning_rate": 0.00014229863231468538, + "loss": 0.9091, + "step": 7405 + }, + { + "epoch": 0.43, + "grad_norm": 0.28515625, + "learning_rate": 0.00014220786485115772, + "loss": 0.9742, + "step": 7410 + }, + { + "epoch": 0.43, + "grad_norm": 0.263671875, + "learning_rate": 0.0001421170550560913, + "loss": 0.915, + "step": 7415 + }, + { + "epoch": 0.43, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001420262030205621, + "loss": 1.0453, + "step": 7420 + }, + { + "epoch": 0.43, + "grad_norm": 0.28515625, + "learning_rate": 0.0001419353088356883, + "loss": 1.0303, + "step": 7425 + }, + { + "epoch": 0.43, + "grad_norm": 0.263671875, + "learning_rate": 0.00014184437259263038, + "loss": 0.9718, + "step": 7430 + }, + { + "epoch": 0.43, + "grad_norm": 0.26171875, + "learning_rate": 0.00014175339438259112, + "loss": 0.9389, + "step": 7435 + }, + { + "epoch": 0.43, + "grad_norm": 0.26171875, + "learning_rate": 0.00014166237429681525, + "loss": 0.9127, + "step": 7440 + }, + { + "epoch": 0.43, + "grad_norm": 0.263671875, + "learning_rate": 0.00014157131242658957, + "loss": 0.9791, + "step": 7445 + }, + { + "epoch": 0.43, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001414802088632428, + "loss": 0.8747, + "step": 7450 + }, + { + "epoch": 0.43, + "grad_norm": 0.275390625, + "learning_rate": 0.00014138906369814538, + "loss": 0.8839, + "step": 7455 + }, + { + "epoch": 0.43, + "grad_norm": 0.2734375, + "learning_rate": 0.0001412978770227096, + "loss": 0.9465, + "step": 7460 + }, + { + "epoch": 0.43, + "grad_norm": 0.267578125, + "learning_rate": 0.00014120664892838933, + "loss": 0.9703, + "step": 7465 + }, + { + "epoch": 0.43, + "grad_norm": 0.30078125, + "learning_rate": 0.0001411153795066799, + "loss": 0.9488, + "step": 7470 + }, + { + "epoch": 0.43, + "grad_norm": 0.265625, + "learning_rate": 0.00014102406884911826, + "loss": 0.9921, + "step": 7475 + }, + { + "epoch": 0.43, + "grad_norm": 0.259765625, + "learning_rate": 0.00014093271704728252, + "loss": 0.9758, + "step": 7480 + }, + { + "epoch": 0.43, + "grad_norm": 0.28125, + "learning_rate": 0.00014084132419279224, + "loss": 0.9741, + "step": 7485 + }, + { + "epoch": 0.43, + "grad_norm": 0.279296875, + "learning_rate": 0.00014074989037730798, + "loss": 0.9938, + "step": 7490 + }, + { + "epoch": 0.43, + "grad_norm": 0.259765625, + "learning_rate": 0.00014065841569253155, + "loss": 0.9672, + "step": 7495 + }, + { + "epoch": 0.43, + "grad_norm": 0.3046875, + "learning_rate": 0.00014056690023020566, + "loss": 0.9742, + "step": 7500 + }, + { + "epoch": 0.43, + "grad_norm": 0.25390625, + "learning_rate": 0.00014047534408211383, + "loss": 0.9657, + "step": 7505 + }, + { + "epoch": 0.43, + "grad_norm": 0.263671875, + "learning_rate": 0.00014038374734008058, + "loss": 0.9535, + "step": 7510 + }, + { + "epoch": 0.43, + "grad_norm": 0.248046875, + "learning_rate": 0.00014029211009597097, + "loss": 0.9175, + "step": 7515 + }, + { + "epoch": 0.43, + "grad_norm": 0.275390625, + "learning_rate": 0.00014020043244169082, + "loss": 0.9896, + "step": 7520 + }, + { + "epoch": 0.43, + "grad_norm": 0.2470703125, + "learning_rate": 0.00014010871446918635, + "loss": 0.8928, + "step": 7525 + }, + { + "epoch": 0.43, + "grad_norm": 0.259765625, + "learning_rate": 0.00014001695627044428, + "loss": 0.8972, + "step": 7530 + }, + { + "epoch": 0.43, + "grad_norm": 0.26171875, + "learning_rate": 0.00013992515793749172, + "loss": 0.952, + "step": 7535 + }, + { + "epoch": 0.43, + "grad_norm": 0.31640625, + "learning_rate": 0.00013983331956239596, + "loss": 0.9458, + "step": 7540 + }, + { + "epoch": 0.43, + "grad_norm": 0.255859375, + "learning_rate": 0.00013974144123726442, + "loss": 0.9653, + "step": 7545 + }, + { + "epoch": 0.43, + "grad_norm": 0.24609375, + "learning_rate": 0.00013964952305424474, + "loss": 0.9689, + "step": 7550 + }, + { + "epoch": 0.43, + "grad_norm": 0.2734375, + "learning_rate": 0.00013955756510552437, + "loss": 1.0045, + "step": 7555 + }, + { + "epoch": 0.43, + "grad_norm": 0.275390625, + "learning_rate": 0.0001394655674833307, + "loss": 1.0394, + "step": 7560 + }, + { + "epoch": 0.43, + "grad_norm": 0.2578125, + "learning_rate": 0.00013937353027993092, + "loss": 0.8979, + "step": 7565 + }, + { + "epoch": 0.43, + "grad_norm": 0.267578125, + "learning_rate": 0.00013928145358763194, + "loss": 0.9154, + "step": 7570 + }, + { + "epoch": 0.43, + "grad_norm": 0.25390625, + "learning_rate": 0.00013918933749878024, + "loss": 0.9568, + "step": 7575 + }, + { + "epoch": 0.43, + "grad_norm": 0.28125, + "learning_rate": 0.00013909718210576179, + "loss": 0.9796, + "step": 7580 + }, + { + "epoch": 0.44, + "grad_norm": 0.2490234375, + "learning_rate": 0.000139004987501002, + "loss": 0.941, + "step": 7585 + }, + { + "epoch": 0.44, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001389127537769657, + "loss": 0.923, + "step": 7590 + }, + { + "epoch": 0.44, + "grad_norm": 0.263671875, + "learning_rate": 0.00013882048102615676, + "loss": 0.9264, + "step": 7595 + }, + { + "epoch": 0.44, + "grad_norm": 0.251953125, + "learning_rate": 0.00013872816934111838, + "loss": 0.9101, + "step": 7600 + }, + { + "epoch": 0.44, + "grad_norm": 0.275390625, + "learning_rate": 0.00013863581881443275, + "loss": 0.9212, + "step": 7605 + }, + { + "epoch": 0.44, + "grad_norm": 0.26953125, + "learning_rate": 0.0001385434295387209, + "loss": 0.9427, + "step": 7610 + }, + { + "epoch": 0.44, + "grad_norm": 0.265625, + "learning_rate": 0.00013845100160664287, + "loss": 0.9683, + "step": 7615 + }, + { + "epoch": 0.44, + "grad_norm": 0.267578125, + "learning_rate": 0.0001383585351108974, + "loss": 0.945, + "step": 7620 + }, + { + "epoch": 0.44, + "grad_norm": 0.259765625, + "learning_rate": 0.00013826603014422192, + "loss": 1.0078, + "step": 7625 + }, + { + "epoch": 0.44, + "grad_norm": 0.263671875, + "learning_rate": 0.0001381734867993925, + "loss": 1.0363, + "step": 7630 + }, + { + "epoch": 0.44, + "grad_norm": 0.26953125, + "learning_rate": 0.0001380809051692236, + "loss": 0.9902, + "step": 7635 + }, + { + "epoch": 0.44, + "grad_norm": 0.248046875, + "learning_rate": 0.00013798828534656812, + "loss": 1.0163, + "step": 7640 + }, + { + "epoch": 0.44, + "grad_norm": 0.26953125, + "learning_rate": 0.00013789562742431727, + "loss": 0.9083, + "step": 7645 + }, + { + "epoch": 0.44, + "grad_norm": 0.251953125, + "learning_rate": 0.00013780293149540053, + "loss": 0.9632, + "step": 7650 + }, + { + "epoch": 0.44, + "grad_norm": 0.2578125, + "learning_rate": 0.00013771019765278537, + "loss": 0.943, + "step": 7655 + }, + { + "epoch": 0.44, + "grad_norm": 0.294921875, + "learning_rate": 0.00013761742598947734, + "loss": 0.9423, + "step": 7660 + }, + { + "epoch": 0.44, + "grad_norm": 0.2890625, + "learning_rate": 0.00013752461659852, + "loss": 0.9826, + "step": 7665 + }, + { + "epoch": 0.44, + "grad_norm": 0.25390625, + "learning_rate": 0.00013743176957299464, + "loss": 0.9697, + "step": 7670 + }, + { + "epoch": 0.44, + "grad_norm": 0.2392578125, + "learning_rate": 0.00013733888500602038, + "loss": 0.9645, + "step": 7675 + }, + { + "epoch": 0.44, + "grad_norm": 0.263671875, + "learning_rate": 0.00013724596299075388, + "loss": 0.9397, + "step": 7680 + }, + { + "epoch": 0.44, + "grad_norm": 0.26953125, + "learning_rate": 0.0001371530036203895, + "loss": 0.9476, + "step": 7685 + }, + { + "epoch": 0.44, + "grad_norm": 0.2578125, + "learning_rate": 0.00013706000698815893, + "loss": 0.929, + "step": 7690 + }, + { + "epoch": 0.44, + "grad_norm": 0.26953125, + "learning_rate": 0.00013696697318733134, + "loss": 0.9219, + "step": 7695 + }, + { + "epoch": 0.44, + "grad_norm": 0.275390625, + "learning_rate": 0.00013687390231121314, + "loss": 0.9162, + "step": 7700 + }, + { + "epoch": 0.44, + "grad_norm": 0.2490234375, + "learning_rate": 0.00013678079445314783, + "loss": 0.8962, + "step": 7705 + }, + { + "epoch": 0.44, + "grad_norm": 0.259765625, + "learning_rate": 0.00013668764970651615, + "loss": 0.9636, + "step": 7710 + }, + { + "epoch": 0.44, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001365944681647358, + "loss": 0.9463, + "step": 7715 + }, + { + "epoch": 0.44, + "grad_norm": 0.28515625, + "learning_rate": 0.00013650124992126128, + "loss": 0.9982, + "step": 7720 + }, + { + "epoch": 0.44, + "grad_norm": 0.28515625, + "learning_rate": 0.00013640799506958403, + "loss": 0.9802, + "step": 7725 + }, + { + "epoch": 0.44, + "grad_norm": 0.25, + "learning_rate": 0.00013631470370323214, + "loss": 0.9338, + "step": 7730 + }, + { + "epoch": 0.44, + "grad_norm": 0.265625, + "learning_rate": 0.0001362213759157703, + "loss": 0.9789, + "step": 7735 + }, + { + "epoch": 0.44, + "grad_norm": 0.265625, + "learning_rate": 0.0001361280118007998, + "loss": 1.0204, + "step": 7740 + }, + { + "epoch": 0.44, + "grad_norm": 0.28125, + "learning_rate": 0.0001360346114519583, + "loss": 0.949, + "step": 7745 + }, + { + "epoch": 0.44, + "grad_norm": 0.29296875, + "learning_rate": 0.0001359411749629198, + "loss": 0.9641, + "step": 7750 + }, + { + "epoch": 0.44, + "grad_norm": 0.30078125, + "learning_rate": 0.0001358477024273946, + "loss": 0.9711, + "step": 7755 + }, + { + "epoch": 0.45, + "grad_norm": 0.25390625, + "learning_rate": 0.0001357541939391291, + "loss": 0.9231, + "step": 7760 + }, + { + "epoch": 0.45, + "grad_norm": 0.27734375, + "learning_rate": 0.00013566064959190583, + "loss": 1.0005, + "step": 7765 + }, + { + "epoch": 0.45, + "grad_norm": 0.279296875, + "learning_rate": 0.0001355670694795432, + "loss": 0.9869, + "step": 7770 + }, + { + "epoch": 0.45, + "grad_norm": 0.298828125, + "learning_rate": 0.0001354734536958955, + "loss": 0.9578, + "step": 7775 + }, + { + "epoch": 0.45, + "grad_norm": 0.26171875, + "learning_rate": 0.0001353798023348528, + "loss": 0.9296, + "step": 7780 + }, + { + "epoch": 0.45, + "grad_norm": 0.255859375, + "learning_rate": 0.00013528611549034096, + "loss": 0.9171, + "step": 7785 + }, + { + "epoch": 0.45, + "grad_norm": 0.259765625, + "learning_rate": 0.00013519239325632124, + "loss": 0.9803, + "step": 7790 + }, + { + "epoch": 0.45, + "grad_norm": 0.27734375, + "learning_rate": 0.00013509863572679057, + "loss": 0.9256, + "step": 7795 + }, + { + "epoch": 0.45, + "grad_norm": 0.26953125, + "learning_rate": 0.00013500484299578116, + "loss": 0.968, + "step": 7800 + }, + { + "epoch": 0.45, + "grad_norm": 0.26171875, + "learning_rate": 0.00013491101515736057, + "loss": 0.9171, + "step": 7805 + }, + { + "epoch": 0.45, + "grad_norm": 0.265625, + "learning_rate": 0.00013481715230563153, + "loss": 1.0461, + "step": 7810 + }, + { + "epoch": 0.45, + "grad_norm": 0.279296875, + "learning_rate": 0.00013472325453473197, + "loss": 0.9204, + "step": 7815 + }, + { + "epoch": 0.45, + "grad_norm": 0.26953125, + "learning_rate": 0.00013462932193883482, + "loss": 0.9476, + "step": 7820 + }, + { + "epoch": 0.45, + "grad_norm": 0.244140625, + "learning_rate": 0.0001345353546121478, + "loss": 0.8913, + "step": 7825 + }, + { + "epoch": 0.45, + "grad_norm": 0.259765625, + "learning_rate": 0.00013444135264891371, + "loss": 0.9357, + "step": 7830 + }, + { + "epoch": 0.45, + "grad_norm": 0.265625, + "learning_rate": 0.00013434731614340984, + "loss": 0.8973, + "step": 7835 + }, + { + "epoch": 0.45, + "grad_norm": 0.265625, + "learning_rate": 0.00013425324518994826, + "loss": 0.9029, + "step": 7840 + }, + { + "epoch": 0.45, + "grad_norm": 0.25390625, + "learning_rate": 0.0001341591398828756, + "loss": 1.0038, + "step": 7845 + }, + { + "epoch": 0.45, + "grad_norm": 0.251953125, + "learning_rate": 0.00013406500031657283, + "loss": 1.0186, + "step": 7850 + }, + { + "epoch": 0.45, + "grad_norm": 0.27734375, + "learning_rate": 0.00013397082658545543, + "loss": 0.9331, + "step": 7855 + }, + { + "epoch": 0.45, + "grad_norm": 0.255859375, + "learning_rate": 0.00013387661878397307, + "loss": 0.9009, + "step": 7860 + }, + { + "epoch": 0.45, + "grad_norm": 0.259765625, + "learning_rate": 0.00013378237700660957, + "loss": 0.9224, + "step": 7865 + }, + { + "epoch": 0.45, + "grad_norm": 0.2578125, + "learning_rate": 0.00013368810134788278, + "loss": 0.972, + "step": 7870 + }, + { + "epoch": 0.45, + "grad_norm": 0.251953125, + "learning_rate": 0.00013359379190234472, + "loss": 0.8709, + "step": 7875 + }, + { + "epoch": 0.45, + "grad_norm": 0.2421875, + "learning_rate": 0.00013349944876458108, + "loss": 0.9213, + "step": 7880 + }, + { + "epoch": 0.45, + "grad_norm": 0.27734375, + "learning_rate": 0.00013340507202921152, + "loss": 0.9116, + "step": 7885 + }, + { + "epoch": 0.45, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001333106617908892, + "loss": 0.9288, + "step": 7890 + }, + { + "epoch": 0.45, + "grad_norm": 0.25390625, + "learning_rate": 0.00013321621814430106, + "loss": 0.996, + "step": 7895 + }, + { + "epoch": 0.45, + "grad_norm": 0.2734375, + "learning_rate": 0.0001331217411841675, + "loss": 0.96, + "step": 7900 + }, + { + "epoch": 0.45, + "grad_norm": 0.314453125, + "learning_rate": 0.00013302723100524222, + "loss": 0.9032, + "step": 7905 + }, + { + "epoch": 0.45, + "grad_norm": 0.25, + "learning_rate": 0.0001329326877023124, + "loss": 0.9752, + "step": 7910 + }, + { + "epoch": 0.45, + "grad_norm": 0.2578125, + "learning_rate": 0.00013283811137019836, + "loss": 0.9543, + "step": 7915 + }, + { + "epoch": 0.45, + "grad_norm": 0.263671875, + "learning_rate": 0.00013274350210375357, + "loss": 0.9367, + "step": 7920 + }, + { + "epoch": 0.45, + "grad_norm": 0.30078125, + "learning_rate": 0.00013264885999786442, + "loss": 0.9281, + "step": 7925 + }, + { + "epoch": 0.46, + "grad_norm": 0.283203125, + "learning_rate": 0.0001325541851474504, + "loss": 0.9697, + "step": 7930 + }, + { + "epoch": 0.46, + "grad_norm": 0.263671875, + "learning_rate": 0.0001324594776474638, + "loss": 0.9704, + "step": 7935 + }, + { + "epoch": 0.46, + "grad_norm": 0.265625, + "learning_rate": 0.0001323647375928895, + "loss": 0.9694, + "step": 7940 + }, + { + "epoch": 0.46, + "grad_norm": 0.2578125, + "learning_rate": 0.00013226996507874526, + "loss": 1.0088, + "step": 7945 + }, + { + "epoch": 0.46, + "grad_norm": 0.265625, + "learning_rate": 0.00013217516020008128, + "loss": 0.9742, + "step": 7950 + }, + { + "epoch": 0.46, + "grad_norm": 0.2392578125, + "learning_rate": 0.00013208032305198018, + "loss": 1.0108, + "step": 7955 + }, + { + "epoch": 0.46, + "grad_norm": 0.25, + "learning_rate": 0.00013198545372955706, + "loss": 0.9823, + "step": 7960 + }, + { + "epoch": 0.46, + "grad_norm": 0.267578125, + "learning_rate": 0.00013189055232795915, + "loss": 0.974, + "step": 7965 + }, + { + "epoch": 0.46, + "grad_norm": 0.29296875, + "learning_rate": 0.00013179561894236598, + "loss": 1.0433, + "step": 7970 + }, + { + "epoch": 0.46, + "grad_norm": 0.275390625, + "learning_rate": 0.00013170065366798907, + "loss": 0.9458, + "step": 7975 + }, + { + "epoch": 0.46, + "grad_norm": 0.27734375, + "learning_rate": 0.00013160565660007195, + "loss": 0.9387, + "step": 7980 + }, + { + "epoch": 0.46, + "grad_norm": 0.263671875, + "learning_rate": 0.00013151062783389007, + "loss": 0.8602, + "step": 7985 + }, + { + "epoch": 0.46, + "grad_norm": 0.298828125, + "learning_rate": 0.00013141556746475058, + "loss": 0.9077, + "step": 7990 + }, + { + "epoch": 0.46, + "grad_norm": 0.255859375, + "learning_rate": 0.00013132047558799242, + "loss": 0.9475, + "step": 7995 + }, + { + "epoch": 0.46, + "grad_norm": 0.65625, + "learning_rate": 0.00013122535229898613, + "loss": 0.9877, + "step": 8000 + }, + { + "epoch": 0.46, + "grad_norm": 0.287109375, + "learning_rate": 0.0001311301976931337, + "loss": 0.8818, + "step": 8005 + }, + { + "epoch": 0.46, + "grad_norm": 0.265625, + "learning_rate": 0.00013103501186586855, + "loss": 0.9365, + "step": 8010 + }, + { + "epoch": 0.46, + "grad_norm": 0.251953125, + "learning_rate": 0.00013093979491265542, + "loss": 0.9283, + "step": 8015 + }, + { + "epoch": 0.46, + "grad_norm": 0.2578125, + "learning_rate": 0.0001308445469289902, + "loss": 0.9965, + "step": 8020 + }, + { + "epoch": 0.46, + "grad_norm": 0.251953125, + "learning_rate": 0.00013074926801040005, + "loss": 0.9523, + "step": 8025 + }, + { + "epoch": 0.46, + "grad_norm": 0.2470703125, + "learning_rate": 0.000130653958252443, + "loss": 0.9938, + "step": 8030 + }, + { + "epoch": 0.46, + "grad_norm": 0.279296875, + "learning_rate": 0.0001305586177507081, + "loss": 0.954, + "step": 8035 + }, + { + "epoch": 0.46, + "grad_norm": 0.259765625, + "learning_rate": 0.00013046324660081525, + "loss": 0.9538, + "step": 8040 + }, + { + "epoch": 0.46, + "grad_norm": 0.255859375, + "learning_rate": 0.00013036784489841495, + "loss": 0.9105, + "step": 8045 + }, + { + "epoch": 0.46, + "grad_norm": 0.25, + "learning_rate": 0.00013027241273918855, + "loss": 0.9565, + "step": 8050 + }, + { + "epoch": 0.46, + "grad_norm": 0.24609375, + "learning_rate": 0.00013017695021884777, + "loss": 0.877, + "step": 8055 + }, + { + "epoch": 0.46, + "grad_norm": 0.27734375, + "learning_rate": 0.00013008145743313487, + "loss": 0.937, + "step": 8060 + }, + { + "epoch": 0.46, + "grad_norm": 0.267578125, + "learning_rate": 0.00012998593447782246, + "loss": 0.9396, + "step": 8065 + }, + { + "epoch": 0.46, + "grad_norm": 0.26171875, + "learning_rate": 0.0001298903814487133, + "loss": 0.925, + "step": 8070 + }, + { + "epoch": 0.46, + "grad_norm": 0.2578125, + "learning_rate": 0.0001297947984416405, + "loss": 0.9002, + "step": 8075 + }, + { + "epoch": 0.46, + "grad_norm": 0.30078125, + "learning_rate": 0.0001296991855524671, + "loss": 0.9851, + "step": 8080 + }, + { + "epoch": 0.46, + "grad_norm": 0.26171875, + "learning_rate": 0.0001296035428770861, + "loss": 0.9482, + "step": 8085 + }, + { + "epoch": 0.46, + "grad_norm": 0.267578125, + "learning_rate": 0.00012950787051142052, + "loss": 0.9381, + "step": 8090 + }, + { + "epoch": 0.46, + "grad_norm": 0.26171875, + "learning_rate": 0.00012941216855142298, + "loss": 0.9905, + "step": 8095 + }, + { + "epoch": 0.46, + "grad_norm": 0.25, + "learning_rate": 0.00012931643709307588, + "loss": 0.9794, + "step": 8100 + }, + { + "epoch": 0.47, + "grad_norm": 0.25390625, + "learning_rate": 0.00012922067623239117, + "loss": 0.9296, + "step": 8105 + }, + { + "epoch": 0.47, + "grad_norm": 0.26171875, + "learning_rate": 0.00012912488606541035, + "loss": 0.9246, + "step": 8110 + }, + { + "epoch": 0.47, + "grad_norm": 0.28125, + "learning_rate": 0.00012902906668820418, + "loss": 0.9497, + "step": 8115 + }, + { + "epoch": 0.47, + "grad_norm": 0.236328125, + "learning_rate": 0.00012893321819687286, + "loss": 0.9739, + "step": 8120 + }, + { + "epoch": 0.47, + "grad_norm": 0.2421875, + "learning_rate": 0.0001288373406875457, + "loss": 0.884, + "step": 8125 + }, + { + "epoch": 0.47, + "grad_norm": 0.265625, + "learning_rate": 0.00012874143425638116, + "loss": 0.9579, + "step": 8130 + }, + { + "epoch": 0.47, + "grad_norm": 0.267578125, + "learning_rate": 0.0001286454989995667, + "loss": 0.9465, + "step": 8135 + }, + { + "epoch": 0.47, + "grad_norm": 0.24609375, + "learning_rate": 0.00012854953501331863, + "loss": 1.0287, + "step": 8140 + }, + { + "epoch": 0.47, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001284535423938822, + "loss": 0.9739, + "step": 8145 + }, + { + "epoch": 0.47, + "grad_norm": 0.2734375, + "learning_rate": 0.0001283575212375312, + "loss": 0.9755, + "step": 8150 + }, + { + "epoch": 0.47, + "grad_norm": 0.251953125, + "learning_rate": 0.00012826147164056822, + "loss": 0.9555, + "step": 8155 + }, + { + "epoch": 0.47, + "grad_norm": 0.26953125, + "learning_rate": 0.00012816539369932422, + "loss": 0.9819, + "step": 8160 + }, + { + "epoch": 0.47, + "grad_norm": 0.25390625, + "learning_rate": 0.00012806928751015874, + "loss": 0.9665, + "step": 8165 + }, + { + "epoch": 0.47, + "grad_norm": 0.259765625, + "learning_rate": 0.0001279731531694595, + "loss": 0.9912, + "step": 8170 + }, + { + "epoch": 0.47, + "grad_norm": 0.263671875, + "learning_rate": 0.00012787699077364262, + "loss": 0.8946, + "step": 8175 + }, + { + "epoch": 0.47, + "grad_norm": 0.28515625, + "learning_rate": 0.00012778080041915215, + "loss": 1.0273, + "step": 8180 + }, + { + "epoch": 0.47, + "grad_norm": 0.271484375, + "learning_rate": 0.00012768458220246035, + "loss": 1.0033, + "step": 8185 + }, + { + "epoch": 0.47, + "grad_norm": 0.26171875, + "learning_rate": 0.00012758833622006737, + "loss": 0.9711, + "step": 8190 + }, + { + "epoch": 0.47, + "grad_norm": 0.26171875, + "learning_rate": 0.0001274920625685012, + "loss": 0.9333, + "step": 8195 + }, + { + "epoch": 0.47, + "grad_norm": 0.251953125, + "learning_rate": 0.0001273957613443176, + "loss": 0.9855, + "step": 8200 + }, + { + "epoch": 0.47, + "grad_norm": 0.259765625, + "learning_rate": 0.00012729943264409992, + "loss": 0.9678, + "step": 8205 + }, + { + "epoch": 0.47, + "grad_norm": 0.25390625, + "learning_rate": 0.00012720307656445914, + "loss": 0.9855, + "step": 8210 + }, + { + "epoch": 0.47, + "grad_norm": 0.25, + "learning_rate": 0.0001271066932020337, + "loss": 0.9305, + "step": 8215 + }, + { + "epoch": 0.47, + "grad_norm": 0.337890625, + "learning_rate": 0.00012701028265348934, + "loss": 0.9011, + "step": 8220 + }, + { + "epoch": 0.47, + "grad_norm": 0.265625, + "learning_rate": 0.0001269138450155191, + "loss": 0.938, + "step": 8225 + }, + { + "epoch": 0.47, + "grad_norm": 0.26953125, + "learning_rate": 0.00012681738038484324, + "loss": 0.9451, + "step": 8230 + }, + { + "epoch": 0.47, + "grad_norm": 0.255859375, + "learning_rate": 0.00012672088885820897, + "loss": 0.957, + "step": 8235 + }, + { + "epoch": 0.47, + "grad_norm": 0.287109375, + "learning_rate": 0.00012662437053239062, + "loss": 0.9565, + "step": 8240 + }, + { + "epoch": 0.47, + "grad_norm": 0.265625, + "learning_rate": 0.0001265278255041893, + "loss": 0.9258, + "step": 8245 + }, + { + "epoch": 0.47, + "grad_norm": 0.2734375, + "learning_rate": 0.0001264312538704329, + "loss": 0.9812, + "step": 8250 + }, + { + "epoch": 0.47, + "grad_norm": 0.2578125, + "learning_rate": 0.00012633465572797604, + "loss": 0.9499, + "step": 8255 + }, + { + "epoch": 0.47, + "grad_norm": 0.244140625, + "learning_rate": 0.0001262380311736999, + "loss": 0.9338, + "step": 8260 + }, + { + "epoch": 0.47, + "grad_norm": 0.28515625, + "learning_rate": 0.0001261413803045122, + "loss": 0.9084, + "step": 8265 + }, + { + "epoch": 0.47, + "grad_norm": 0.267578125, + "learning_rate": 0.00012604470321734694, + "loss": 1.024, + "step": 8270 + }, + { + "epoch": 0.47, + "grad_norm": 0.267578125, + "learning_rate": 0.00012594800000916448, + "loss": 0.996, + "step": 8275 + }, + { + "epoch": 0.48, + "grad_norm": 0.26171875, + "learning_rate": 0.00012585127077695144, + "loss": 0.9739, + "step": 8280 + }, + { + "epoch": 0.48, + "grad_norm": 0.244140625, + "learning_rate": 0.00012575451561772048, + "loss": 0.9378, + "step": 8285 + }, + { + "epoch": 0.48, + "grad_norm": 0.2470703125, + "learning_rate": 0.00012565773462851017, + "loss": 0.9997, + "step": 8290 + }, + { + "epoch": 0.48, + "grad_norm": 0.251953125, + "learning_rate": 0.00012556092790638518, + "loss": 0.9145, + "step": 8295 + }, + { + "epoch": 0.48, + "grad_norm": 0.255859375, + "learning_rate": 0.00012546409554843585, + "loss": 0.9176, + "step": 8300 + }, + { + "epoch": 0.48, + "grad_norm": 0.267578125, + "learning_rate": 0.00012536723765177826, + "loss": 0.927, + "step": 8305 + }, + { + "epoch": 0.48, + "grad_norm": 0.287109375, + "learning_rate": 0.0001252703543135541, + "loss": 1.0053, + "step": 8310 + }, + { + "epoch": 0.48, + "grad_norm": 0.2578125, + "learning_rate": 0.0001251734456309306, + "loss": 0.8807, + "step": 8315 + }, + { + "epoch": 0.48, + "grad_norm": 0.27734375, + "learning_rate": 0.00012507651170110042, + "loss": 1.0664, + "step": 8320 + }, + { + "epoch": 0.48, + "grad_norm": 0.2734375, + "learning_rate": 0.0001249795526212815, + "loss": 0.9316, + "step": 8325 + }, + { + "epoch": 0.48, + "grad_norm": 0.271484375, + "learning_rate": 0.000124882568488717, + "loss": 0.9411, + "step": 8330 + }, + { + "epoch": 0.48, + "grad_norm": 0.26953125, + "learning_rate": 0.00012478555940067528, + "loss": 0.9834, + "step": 8335 + }, + { + "epoch": 0.48, + "grad_norm": 0.2490234375, + "learning_rate": 0.00012468852545444961, + "loss": 0.9721, + "step": 8340 + }, + { + "epoch": 0.48, + "grad_norm": 0.26171875, + "learning_rate": 0.0001245914667473583, + "loss": 0.9796, + "step": 8345 + }, + { + "epoch": 0.48, + "grad_norm": 0.259765625, + "learning_rate": 0.00012449438337674447, + "loss": 0.9297, + "step": 8350 + }, + { + "epoch": 0.48, + "grad_norm": 0.259765625, + "learning_rate": 0.00012439727543997586, + "loss": 0.9277, + "step": 8355 + }, + { + "epoch": 0.48, + "grad_norm": 0.248046875, + "learning_rate": 0.00012430014303444503, + "loss": 0.8868, + "step": 8360 + }, + { + "epoch": 0.48, + "grad_norm": 0.251953125, + "learning_rate": 0.00012420298625756898, + "loss": 0.9015, + "step": 8365 + }, + { + "epoch": 0.48, + "grad_norm": 0.263671875, + "learning_rate": 0.0001241058052067892, + "loss": 1.0216, + "step": 8370 + }, + { + "epoch": 0.48, + "grad_norm": 0.25, + "learning_rate": 0.0001240085999795714, + "loss": 0.904, + "step": 8375 + }, + { + "epoch": 0.48, + "grad_norm": 0.275390625, + "learning_rate": 0.00012391137067340572, + "loss": 0.9708, + "step": 8380 + }, + { + "epoch": 0.48, + "grad_norm": 0.26171875, + "learning_rate": 0.0001238141173858063, + "loss": 0.8768, + "step": 8385 + }, + { + "epoch": 0.48, + "grad_norm": 0.263671875, + "learning_rate": 0.00012371684021431144, + "loss": 0.9909, + "step": 8390 + }, + { + "epoch": 0.48, + "grad_norm": 0.42578125, + "learning_rate": 0.00012361953925648327, + "loss": 0.9845, + "step": 8395 + }, + { + "epoch": 0.48, + "grad_norm": 0.265625, + "learning_rate": 0.0001235222146099079, + "loss": 0.9625, + "step": 8400 + }, + { + "epoch": 0.48, + "grad_norm": 0.25390625, + "learning_rate": 0.00012342486637219517, + "loss": 0.9705, + "step": 8405 + }, + { + "epoch": 0.48, + "grad_norm": 0.27734375, + "learning_rate": 0.00012332749464097855, + "loss": 0.9871, + "step": 8410 + }, + { + "epoch": 0.48, + "grad_norm": 0.255859375, + "learning_rate": 0.00012323009951391504, + "loss": 0.934, + "step": 8415 + }, + { + "epoch": 0.48, + "grad_norm": 0.28125, + "learning_rate": 0.00012313268108868518, + "loss": 0.9593, + "step": 8420 + }, + { + "epoch": 0.48, + "grad_norm": 0.263671875, + "learning_rate": 0.00012303523946299285, + "loss": 0.9758, + "step": 8425 + }, + { + "epoch": 0.48, + "grad_norm": 0.275390625, + "learning_rate": 0.00012293777473456518, + "loss": 0.9393, + "step": 8430 + }, + { + "epoch": 0.48, + "grad_norm": 0.263671875, + "learning_rate": 0.00012284028700115245, + "loss": 0.977, + "step": 8435 + }, + { + "epoch": 0.48, + "grad_norm": 0.255859375, + "learning_rate": 0.0001227427763605281, + "loss": 0.9755, + "step": 8440 + }, + { + "epoch": 0.48, + "grad_norm": 0.263671875, + "learning_rate": 0.0001226452429104884, + "loss": 0.982, + "step": 8445 + }, + { + "epoch": 0.48, + "grad_norm": 0.267578125, + "learning_rate": 0.0001225476867488527, + "loss": 1.0214, + "step": 8450 + }, + { + "epoch": 0.49, + "grad_norm": 0.294921875, + "learning_rate": 0.00012245010797346296, + "loss": 0.9816, + "step": 8455 + }, + { + "epoch": 0.49, + "grad_norm": 0.26953125, + "learning_rate": 0.00012235250668218386, + "loss": 1.0024, + "step": 8460 + }, + { + "epoch": 0.49, + "grad_norm": 0.25, + "learning_rate": 0.00012225488297290266, + "loss": 0.9436, + "step": 8465 + }, + { + "epoch": 0.49, + "grad_norm": 0.255859375, + "learning_rate": 0.00012215723694352916, + "loss": 0.9496, + "step": 8470 + }, + { + "epoch": 0.49, + "grad_norm": 0.2470703125, + "learning_rate": 0.00012205956869199549, + "loss": 0.9747, + "step": 8475 + }, + { + "epoch": 0.49, + "grad_norm": 0.2490234375, + "learning_rate": 0.00012196187831625605, + "loss": 0.9595, + "step": 8480 + }, + { + "epoch": 0.49, + "grad_norm": 0.26171875, + "learning_rate": 0.00012186416591428751, + "loss": 0.9554, + "step": 8485 + }, + { + "epoch": 0.49, + "grad_norm": 0.263671875, + "learning_rate": 0.00012176643158408853, + "loss": 0.8985, + "step": 8490 + }, + { + "epoch": 0.49, + "grad_norm": 0.25390625, + "learning_rate": 0.00012166867542367985, + "loss": 0.9213, + "step": 8495 + }, + { + "epoch": 0.49, + "grad_norm": 0.2578125, + "learning_rate": 0.00012157089753110406, + "loss": 0.9098, + "step": 8500 + }, + { + "epoch": 0.49, + "grad_norm": 0.2451171875, + "learning_rate": 0.00012147309800442555, + "loss": 1.0076, + "step": 8505 + }, + { + "epoch": 0.49, + "grad_norm": 0.248046875, + "learning_rate": 0.00012137527694173038, + "loss": 0.9378, + "step": 8510 + }, + { + "epoch": 0.49, + "grad_norm": 0.255859375, + "learning_rate": 0.00012127743444112629, + "loss": 0.9233, + "step": 8515 + }, + { + "epoch": 0.49, + "grad_norm": 0.26953125, + "learning_rate": 0.00012117957060074245, + "loss": 0.9537, + "step": 8520 + }, + { + "epoch": 0.49, + "grad_norm": 0.255859375, + "learning_rate": 0.00012108168551872944, + "loss": 0.9529, + "step": 8525 + }, + { + "epoch": 0.49, + "grad_norm": 0.27734375, + "learning_rate": 0.00012098377929325917, + "loss": 0.9558, + "step": 8530 + }, + { + "epoch": 0.49, + "grad_norm": 0.287109375, + "learning_rate": 0.00012088585202252474, + "loss": 1.0213, + "step": 8535 + }, + { + "epoch": 0.49, + "grad_norm": 0.2392578125, + "learning_rate": 0.00012078790380474037, + "loss": 0.9165, + "step": 8540 + }, + { + "epoch": 0.49, + "grad_norm": 0.294921875, + "learning_rate": 0.00012068993473814126, + "loss": 0.8604, + "step": 8545 + }, + { + "epoch": 0.49, + "grad_norm": 0.271484375, + "learning_rate": 0.00012059194492098351, + "loss": 0.879, + "step": 8550 + }, + { + "epoch": 0.49, + "grad_norm": 0.259765625, + "learning_rate": 0.00012049393445154411, + "loss": 0.9013, + "step": 8555 + }, + { + "epoch": 0.49, + "grad_norm": 0.26171875, + "learning_rate": 0.0001203959034281207, + "loss": 0.9588, + "step": 8560 + }, + { + "epoch": 0.49, + "grad_norm": 0.2470703125, + "learning_rate": 0.00012029785194903153, + "loss": 0.9096, + "step": 8565 + }, + { + "epoch": 0.49, + "grad_norm": 0.267578125, + "learning_rate": 0.00012019978011261541, + "loss": 0.9206, + "step": 8570 + }, + { + "epoch": 0.49, + "grad_norm": 0.26171875, + "learning_rate": 0.00012010168801723149, + "loss": 0.9477, + "step": 8575 + }, + { + "epoch": 0.49, + "grad_norm": 0.2451171875, + "learning_rate": 0.00012000357576125932, + "loss": 0.8503, + "step": 8580 + }, + { + "epoch": 0.49, + "grad_norm": 0.263671875, + "learning_rate": 0.00011990544344309865, + "loss": 0.9125, + "step": 8585 + }, + { + "epoch": 0.49, + "grad_norm": 0.265625, + "learning_rate": 0.00011980729116116927, + "loss": 0.9419, + "step": 8590 + }, + { + "epoch": 0.49, + "grad_norm": 0.2490234375, + "learning_rate": 0.00011970911901391113, + "loss": 0.9902, + "step": 8595 + }, + { + "epoch": 0.49, + "grad_norm": 0.24609375, + "learning_rate": 0.00011961092709978402, + "loss": 0.9452, + "step": 8600 + }, + { + "epoch": 0.49, + "grad_norm": 0.265625, + "learning_rate": 0.00011951271551726755, + "loss": 0.9609, + "step": 8605 + }, + { + "epoch": 0.49, + "grad_norm": 0.263671875, + "learning_rate": 0.00011941448436486106, + "loss": 1.0057, + "step": 8610 + }, + { + "epoch": 0.49, + "grad_norm": 0.267578125, + "learning_rate": 0.00011931623374108358, + "loss": 0.9763, + "step": 8615 + }, + { + "epoch": 0.49, + "grad_norm": 0.26171875, + "learning_rate": 0.00011921796374447356, + "loss": 0.9368, + "step": 8620 + }, + { + "epoch": 0.49, + "grad_norm": 0.25390625, + "learning_rate": 0.00011911967447358901, + "loss": 0.8984, + "step": 8625 + }, + { + "epoch": 0.5, + "grad_norm": 0.2578125, + "learning_rate": 0.00011902136602700711, + "loss": 0.9687, + "step": 8630 + }, + { + "epoch": 0.5, + "grad_norm": 0.265625, + "learning_rate": 0.00011892303850332443, + "loss": 0.9462, + "step": 8635 + }, + { + "epoch": 0.5, + "grad_norm": 0.302734375, + "learning_rate": 0.00011882469200115656, + "loss": 0.9771, + "step": 8640 + }, + { + "epoch": 0.5, + "grad_norm": 0.26171875, + "learning_rate": 0.00011872632661913823, + "loss": 1.0017, + "step": 8645 + }, + { + "epoch": 0.5, + "grad_norm": 0.275390625, + "learning_rate": 0.00011862794245592301, + "loss": 0.8716, + "step": 8650 + }, + { + "epoch": 0.5, + "grad_norm": 0.2890625, + "learning_rate": 0.00011852953961018332, + "loss": 0.9954, + "step": 8655 + }, + { + "epoch": 0.5, + "grad_norm": 0.265625, + "learning_rate": 0.00011843111818061036, + "loss": 0.9749, + "step": 8660 + }, + { + "epoch": 0.5, + "grad_norm": 0.26171875, + "learning_rate": 0.00011833267826591394, + "loss": 0.9266, + "step": 8665 + }, + { + "epoch": 0.5, + "grad_norm": 0.275390625, + "learning_rate": 0.0001182342199648224, + "loss": 0.9462, + "step": 8670 + }, + { + "epoch": 0.5, + "grad_norm": 0.251953125, + "learning_rate": 0.00011813574337608258, + "loss": 0.9736, + "step": 8675 + }, + { + "epoch": 0.5, + "grad_norm": 0.287109375, + "learning_rate": 0.00011803724859845952, + "loss": 1.0284, + "step": 8680 + }, + { + "epoch": 0.5, + "grad_norm": 0.2431640625, + "learning_rate": 0.00011793873573073673, + "loss": 0.9386, + "step": 8685 + }, + { + "epoch": 0.5, + "grad_norm": 0.271484375, + "learning_rate": 0.00011784020487171566, + "loss": 0.9723, + "step": 8690 + }, + { + "epoch": 0.5, + "grad_norm": 0.275390625, + "learning_rate": 0.00011774165612021585, + "loss": 0.9502, + "step": 8695 + }, + { + "epoch": 0.5, + "grad_norm": 0.271484375, + "learning_rate": 0.00011764308957507488, + "loss": 0.9251, + "step": 8700 + }, + { + "epoch": 0.5, + "grad_norm": 0.265625, + "learning_rate": 0.00011754450533514807, + "loss": 0.9441, + "step": 8705 + }, + { + "epoch": 0.5, + "grad_norm": 0.2578125, + "learning_rate": 0.00011744590349930849, + "loss": 0.995, + "step": 8710 + }, + { + "epoch": 0.5, + "grad_norm": 0.4453125, + "learning_rate": 0.00011734728416644694, + "loss": 0.8906, + "step": 8715 + }, + { + "epoch": 0.5, + "grad_norm": 0.251953125, + "learning_rate": 0.00011724864743547168, + "loss": 0.8851, + "step": 8720 + }, + { + "epoch": 0.5, + "grad_norm": 0.26953125, + "learning_rate": 0.0001171499934053085, + "loss": 0.9489, + "step": 8725 + }, + { + "epoch": 0.5, + "grad_norm": 0.2578125, + "learning_rate": 0.00011705132217490047, + "loss": 0.8942, + "step": 8730 + }, + { + "epoch": 0.5, + "grad_norm": 0.265625, + "learning_rate": 0.00011695263384320794, + "loss": 0.9441, + "step": 8735 + }, + { + "epoch": 0.5, + "grad_norm": 0.2578125, + "learning_rate": 0.00011685392850920842, + "loss": 0.9637, + "step": 8740 + }, + { + "epoch": 0.5, + "grad_norm": 0.271484375, + "learning_rate": 0.00011675520627189648, + "loss": 0.9091, + "step": 8745 + }, + { + "epoch": 0.5, + "grad_norm": 0.265625, + "learning_rate": 0.00011665646723028359, + "loss": 0.9633, + "step": 8750 + }, + { + "epoch": 0.5, + "grad_norm": 0.283203125, + "learning_rate": 0.00011655771148339812, + "loss": 0.9663, + "step": 8755 + }, + { + "epoch": 0.5, + "grad_norm": 0.25390625, + "learning_rate": 0.00011645893913028514, + "loss": 0.9058, + "step": 8760 + }, + { + "epoch": 0.5, + "grad_norm": 0.251953125, + "learning_rate": 0.00011636015027000651, + "loss": 0.8779, + "step": 8765 + }, + { + "epoch": 0.5, + "grad_norm": 0.265625, + "learning_rate": 0.00011626134500164047, + "loss": 0.9222, + "step": 8770 + }, + { + "epoch": 0.5, + "grad_norm": 0.265625, + "learning_rate": 0.00011616252342428184, + "loss": 0.9287, + "step": 8775 + }, + { + "epoch": 0.5, + "grad_norm": 0.255859375, + "learning_rate": 0.00011606368563704177, + "loss": 0.9116, + "step": 8780 + }, + { + "epoch": 0.5, + "grad_norm": 0.259765625, + "learning_rate": 0.00011596483173904762, + "loss": 0.9408, + "step": 8785 + }, + { + "epoch": 0.5, + "grad_norm": 0.25390625, + "learning_rate": 0.00011586596182944293, + "loss": 0.9107, + "step": 8790 + }, + { + "epoch": 0.5, + "grad_norm": 0.271484375, + "learning_rate": 0.00011576707600738739, + "loss": 0.9608, + "step": 8795 + }, + { + "epoch": 0.5, + "grad_norm": 0.25390625, + "learning_rate": 0.00011566817437205643, + "loss": 0.9313, + "step": 8800 + }, + { + "epoch": 0.51, + "grad_norm": 0.263671875, + "learning_rate": 0.0001155692570226416, + "loss": 0.9032, + "step": 8805 + }, + { + "epoch": 0.51, + "grad_norm": 0.25390625, + "learning_rate": 0.00011547032405835005, + "loss": 0.9539, + "step": 8810 + }, + { + "epoch": 0.51, + "grad_norm": 0.259765625, + "learning_rate": 0.00011537137557840463, + "loss": 0.9214, + "step": 8815 + }, + { + "epoch": 0.51, + "grad_norm": 0.2470703125, + "learning_rate": 0.00011527241168204375, + "loss": 0.9505, + "step": 8820 + }, + { + "epoch": 0.51, + "grad_norm": 0.28125, + "learning_rate": 0.00011517343246852126, + "loss": 1.0026, + "step": 8825 + }, + { + "epoch": 0.51, + "grad_norm": 0.267578125, + "learning_rate": 0.00011507443803710643, + "loss": 1.0072, + "step": 8830 + }, + { + "epoch": 0.51, + "grad_norm": 0.25, + "learning_rate": 0.00011497542848708374, + "loss": 0.8672, + "step": 8835 + }, + { + "epoch": 0.51, + "grad_norm": 0.28515625, + "learning_rate": 0.00011487640391775283, + "loss": 0.9045, + "step": 8840 + }, + { + "epoch": 0.51, + "grad_norm": 0.248046875, + "learning_rate": 0.00011477736442842846, + "loss": 0.9884, + "step": 8845 + }, + { + "epoch": 0.51, + "grad_norm": 0.26171875, + "learning_rate": 0.00011467831011844027, + "loss": 0.955, + "step": 8850 + }, + { + "epoch": 0.51, + "grad_norm": 0.27734375, + "learning_rate": 0.00011457924108713287, + "loss": 0.9621, + "step": 8855 + }, + { + "epoch": 0.51, + "grad_norm": 0.2734375, + "learning_rate": 0.00011448015743386553, + "loss": 0.9314, + "step": 8860 + }, + { + "epoch": 0.51, + "grad_norm": 0.369140625, + "learning_rate": 0.00011438105925801224, + "loss": 0.9508, + "step": 8865 + }, + { + "epoch": 0.51, + "grad_norm": 0.275390625, + "learning_rate": 0.00011428194665896155, + "loss": 0.8861, + "step": 8870 + }, + { + "epoch": 0.51, + "grad_norm": 0.248046875, + "learning_rate": 0.00011418281973611647, + "loss": 0.9121, + "step": 8875 + }, + { + "epoch": 0.51, + "grad_norm": 0.24609375, + "learning_rate": 0.00011408367858889437, + "loss": 0.9316, + "step": 8880 + }, + { + "epoch": 0.51, + "grad_norm": 0.263671875, + "learning_rate": 0.00011398452331672689, + "loss": 1.0033, + "step": 8885 + }, + { + "epoch": 0.51, + "grad_norm": 0.25390625, + "learning_rate": 0.00011388535401905985, + "loss": 0.8959, + "step": 8890 + }, + { + "epoch": 0.51, + "grad_norm": 0.271484375, + "learning_rate": 0.00011378617079535312, + "loss": 0.9182, + "step": 8895 + }, + { + "epoch": 0.51, + "grad_norm": 0.263671875, + "learning_rate": 0.00011368697374508052, + "loss": 0.9436, + "step": 8900 + }, + { + "epoch": 0.51, + "grad_norm": 0.255859375, + "learning_rate": 0.00011358776296772982, + "loss": 0.9697, + "step": 8905 + }, + { + "epoch": 0.51, + "grad_norm": 0.263671875, + "learning_rate": 0.00011348853856280244, + "loss": 0.9119, + "step": 8910 + }, + { + "epoch": 0.51, + "grad_norm": 0.265625, + "learning_rate": 0.00011338930062981352, + "loss": 0.9646, + "step": 8915 + }, + { + "epoch": 0.51, + "grad_norm": 0.26171875, + "learning_rate": 0.0001132900492682918, + "loss": 0.9617, + "step": 8920 + }, + { + "epoch": 0.51, + "grad_norm": 0.291015625, + "learning_rate": 0.00011319078457777947, + "loss": 0.9387, + "step": 8925 + }, + { + "epoch": 0.51, + "grad_norm": 0.2734375, + "learning_rate": 0.00011309150665783204, + "loss": 0.9628, + "step": 8930 + }, + { + "epoch": 0.51, + "grad_norm": 0.265625, + "learning_rate": 0.00011299221560801836, + "loss": 0.9331, + "step": 8935 + }, + { + "epoch": 0.51, + "grad_norm": 0.2578125, + "learning_rate": 0.00011289291152792038, + "loss": 0.9422, + "step": 8940 + }, + { + "epoch": 0.51, + "grad_norm": 0.2490234375, + "learning_rate": 0.00011279359451713318, + "loss": 0.9675, + "step": 8945 + }, + { + "epoch": 0.51, + "grad_norm": 0.255859375, + "learning_rate": 0.00011269426467526477, + "loss": 0.9467, + "step": 8950 + }, + { + "epoch": 0.51, + "grad_norm": 0.26171875, + "learning_rate": 0.00011259492210193603, + "loss": 0.9142, + "step": 8955 + }, + { + "epoch": 0.51, + "grad_norm": 0.259765625, + "learning_rate": 0.00011249556689678063, + "loss": 0.9927, + "step": 8960 + }, + { + "epoch": 0.51, + "grad_norm": 0.2578125, + "learning_rate": 0.00011239619915944488, + "loss": 0.9563, + "step": 8965 + }, + { + "epoch": 0.51, + "grad_norm": 0.255859375, + "learning_rate": 0.00011229681898958775, + "loss": 0.9659, + "step": 8970 + }, + { + "epoch": 0.51, + "grad_norm": 0.265625, + "learning_rate": 0.0001121974264868805, + "loss": 0.8972, + "step": 8975 + }, + { + "epoch": 0.52, + "grad_norm": 0.255859375, + "learning_rate": 0.00011209802175100692, + "loss": 0.882, + "step": 8980 + }, + { + "epoch": 0.52, + "grad_norm": 0.34375, + "learning_rate": 0.00011199860488166302, + "loss": 1.0123, + "step": 8985 + }, + { + "epoch": 0.52, + "grad_norm": 0.298828125, + "learning_rate": 0.00011189917597855694, + "loss": 0.984, + "step": 8990 + }, + { + "epoch": 0.52, + "grad_norm": 0.255859375, + "learning_rate": 0.00011179973514140896, + "loss": 0.9422, + "step": 8995 + }, + { + "epoch": 0.52, + "grad_norm": 0.29296875, + "learning_rate": 0.00011170028246995123, + "loss": 0.9223, + "step": 9000 + }, + { + "epoch": 0.52, + "grad_norm": 0.283203125, + "learning_rate": 0.00011160081806392788, + "loss": 1.0244, + "step": 9005 + }, + { + "epoch": 0.52, + "grad_norm": 0.26953125, + "learning_rate": 0.00011150134202309474, + "loss": 0.8835, + "step": 9010 + }, + { + "epoch": 0.52, + "grad_norm": 0.279296875, + "learning_rate": 0.00011140185444721937, + "loss": 0.9615, + "step": 9015 + }, + { + "epoch": 0.52, + "grad_norm": 0.275390625, + "learning_rate": 0.00011130235543608081, + "loss": 0.9861, + "step": 9020 + }, + { + "epoch": 0.52, + "grad_norm": 0.265625, + "learning_rate": 0.00011120284508946959, + "loss": 0.9579, + "step": 9025 + }, + { + "epoch": 0.52, + "grad_norm": 0.244140625, + "learning_rate": 0.00011110332350718768, + "loss": 0.8844, + "step": 9030 + }, + { + "epoch": 0.52, + "grad_norm": 0.28515625, + "learning_rate": 0.00011100379078904828, + "loss": 0.9529, + "step": 9035 + }, + { + "epoch": 0.52, + "grad_norm": 0.27734375, + "learning_rate": 0.00011090424703487569, + "loss": 0.9963, + "step": 9040 + }, + { + "epoch": 0.52, + "grad_norm": 0.26953125, + "learning_rate": 0.0001108046923445054, + "loss": 0.9544, + "step": 9045 + }, + { + "epoch": 0.52, + "grad_norm": 0.25, + "learning_rate": 0.00011070512681778375, + "loss": 0.9834, + "step": 9050 + }, + { + "epoch": 0.52, + "grad_norm": 0.26171875, + "learning_rate": 0.00011060555055456807, + "loss": 0.9431, + "step": 9055 + }, + { + "epoch": 0.52, + "grad_norm": 0.259765625, + "learning_rate": 0.00011050596365472637, + "loss": 0.9676, + "step": 9060 + }, + { + "epoch": 0.52, + "grad_norm": 0.26171875, + "learning_rate": 0.00011040636621813736, + "loss": 0.948, + "step": 9065 + }, + { + "epoch": 0.52, + "grad_norm": 0.259765625, + "learning_rate": 0.00011030675834469026, + "loss": 0.9499, + "step": 9070 + }, + { + "epoch": 0.52, + "grad_norm": 0.255859375, + "learning_rate": 0.00011020714013428484, + "loss": 0.9566, + "step": 9075 + }, + { + "epoch": 0.52, + "grad_norm": 0.275390625, + "learning_rate": 0.0001101075116868312, + "loss": 0.9713, + "step": 9080 + }, + { + "epoch": 0.52, + "grad_norm": 0.283203125, + "learning_rate": 0.0001100078731022497, + "loss": 0.9098, + "step": 9085 + }, + { + "epoch": 0.52, + "grad_norm": 0.234375, + "learning_rate": 0.00010990822448047089, + "loss": 0.9014, + "step": 9090 + }, + { + "epoch": 0.52, + "grad_norm": 0.255859375, + "learning_rate": 0.00010980856592143538, + "loss": 0.964, + "step": 9095 + }, + { + "epoch": 0.52, + "grad_norm": 0.263671875, + "learning_rate": 0.00010970889752509374, + "loss": 0.9408, + "step": 9100 + }, + { + "epoch": 0.52, + "grad_norm": 0.25390625, + "learning_rate": 0.00010960921939140638, + "loss": 0.9547, + "step": 9105 + }, + { + "epoch": 0.52, + "grad_norm": 0.255859375, + "learning_rate": 0.00010950953162034357, + "loss": 0.9402, + "step": 9110 + }, + { + "epoch": 0.52, + "grad_norm": 0.333984375, + "learning_rate": 0.00010940983431188508, + "loss": 0.9, + "step": 9115 + }, + { + "epoch": 0.52, + "grad_norm": 0.265625, + "learning_rate": 0.00010931012756602039, + "loss": 0.9944, + "step": 9120 + }, + { + "epoch": 0.52, + "grad_norm": 0.27734375, + "learning_rate": 0.00010921041148274838, + "loss": 0.9479, + "step": 9125 + }, + { + "epoch": 0.52, + "grad_norm": 0.259765625, + "learning_rate": 0.00010911068616207736, + "loss": 0.9428, + "step": 9130 + }, + { + "epoch": 0.52, + "grad_norm": 0.279296875, + "learning_rate": 0.00010901095170402479, + "loss": 0.9801, + "step": 9135 + }, + { + "epoch": 0.52, + "grad_norm": 0.259765625, + "learning_rate": 0.00010891120820861745, + "loss": 0.9957, + "step": 9140 + }, + { + "epoch": 0.52, + "grad_norm": 0.26171875, + "learning_rate": 0.00010881145577589103, + "loss": 0.9769, + "step": 9145 + }, + { + "epoch": 0.53, + "grad_norm": 0.279296875, + "learning_rate": 0.00010871169450589025, + "loss": 0.9695, + "step": 9150 + }, + { + "epoch": 0.53, + "grad_norm": 0.251953125, + "learning_rate": 0.00010861192449866871, + "loss": 0.9266, + "step": 9155 + }, + { + "epoch": 0.53, + "grad_norm": 0.251953125, + "learning_rate": 0.00010851214585428878, + "loss": 0.9607, + "step": 9160 + }, + { + "epoch": 0.53, + "grad_norm": 0.259765625, + "learning_rate": 0.00010841235867282137, + "loss": 0.957, + "step": 9165 + }, + { + "epoch": 0.53, + "grad_norm": 0.26953125, + "learning_rate": 0.00010831256305434616, + "loss": 0.8895, + "step": 9170 + }, + { + "epoch": 0.53, + "grad_norm": 0.26171875, + "learning_rate": 0.00010821275909895115, + "loss": 0.9494, + "step": 9175 + }, + { + "epoch": 0.53, + "grad_norm": 0.26171875, + "learning_rate": 0.00010811294690673271, + "loss": 0.9248, + "step": 9180 + }, + { + "epoch": 0.53, + "grad_norm": 0.263671875, + "learning_rate": 0.00010801312657779547, + "loss": 0.9844, + "step": 9185 + }, + { + "epoch": 0.53, + "grad_norm": 0.314453125, + "learning_rate": 0.00010791329821225232, + "loss": 0.9136, + "step": 9190 + }, + { + "epoch": 0.53, + "grad_norm": 0.26171875, + "learning_rate": 0.00010781346191022405, + "loss": 0.9677, + "step": 9195 + }, + { + "epoch": 0.53, + "grad_norm": 0.267578125, + "learning_rate": 0.00010771361777183957, + "loss": 1.0098, + "step": 9200 + }, + { + "epoch": 0.53, + "grad_norm": 0.265625, + "learning_rate": 0.00010761376589723553, + "loss": 1.0026, + "step": 9205 + }, + { + "epoch": 0.53, + "grad_norm": 0.28515625, + "learning_rate": 0.00010751390638655638, + "loss": 0.9912, + "step": 9210 + }, + { + "epoch": 0.53, + "grad_norm": 0.259765625, + "learning_rate": 0.00010741403933995424, + "loss": 0.9464, + "step": 9215 + }, + { + "epoch": 0.53, + "grad_norm": 0.251953125, + "learning_rate": 0.00010731416485758879, + "loss": 0.8838, + "step": 9220 + }, + { + "epoch": 0.53, + "grad_norm": 0.294921875, + "learning_rate": 0.00010721428303962713, + "loss": 1.0435, + "step": 9225 + }, + { + "epoch": 0.53, + "grad_norm": 0.26171875, + "learning_rate": 0.00010711439398624377, + "loss": 0.8534, + "step": 9230 + }, + { + "epoch": 0.53, + "grad_norm": 0.26171875, + "learning_rate": 0.00010701449779762046, + "loss": 0.9371, + "step": 9235 + }, + { + "epoch": 0.53, + "grad_norm": 0.265625, + "learning_rate": 0.00010691459457394604, + "loss": 0.9594, + "step": 9240 + }, + { + "epoch": 0.53, + "grad_norm": 0.271484375, + "learning_rate": 0.00010681468441541648, + "loss": 0.9701, + "step": 9245 + }, + { + "epoch": 0.53, + "grad_norm": 0.28125, + "learning_rate": 0.00010671476742223474, + "loss": 0.9005, + "step": 9250 + }, + { + "epoch": 0.53, + "grad_norm": 0.2890625, + "learning_rate": 0.00010661484369461052, + "loss": 0.9064, + "step": 9255 + }, + { + "epoch": 0.53, + "grad_norm": 0.283203125, + "learning_rate": 0.00010651491333276036, + "loss": 0.9756, + "step": 9260 + }, + { + "epoch": 0.53, + "grad_norm": 0.2578125, + "learning_rate": 0.00010641497643690743, + "loss": 0.9448, + "step": 9265 + }, + { + "epoch": 0.53, + "grad_norm": 0.265625, + "learning_rate": 0.00010631503310728146, + "loss": 0.9097, + "step": 9270 + }, + { + "epoch": 0.53, + "grad_norm": 0.263671875, + "learning_rate": 0.00010621508344411861, + "loss": 0.9309, + "step": 9275 + }, + { + "epoch": 0.53, + "grad_norm": 0.29296875, + "learning_rate": 0.0001061151275476614, + "loss": 0.89, + "step": 9280 + }, + { + "epoch": 0.53, + "grad_norm": 0.2412109375, + "learning_rate": 0.00010601516551815865, + "loss": 0.8777, + "step": 9285 + }, + { + "epoch": 0.53, + "grad_norm": 0.248046875, + "learning_rate": 0.00010591519745586522, + "loss": 0.8984, + "step": 9290 + }, + { + "epoch": 0.53, + "grad_norm": 0.265625, + "learning_rate": 0.00010581522346104215, + "loss": 0.9201, + "step": 9295 + }, + { + "epoch": 0.53, + "grad_norm": 0.255859375, + "learning_rate": 0.00010571524363395635, + "loss": 0.9349, + "step": 9300 + }, + { + "epoch": 0.53, + "grad_norm": 0.265625, + "learning_rate": 0.00010561525807488062, + "loss": 0.9182, + "step": 9305 + }, + { + "epoch": 0.53, + "grad_norm": 0.255859375, + "learning_rate": 0.00010551526688409346, + "loss": 0.9809, + "step": 9310 + }, + { + "epoch": 0.53, + "grad_norm": 0.27734375, + "learning_rate": 0.00010541527016187903, + "loss": 0.9387, + "step": 9315 + }, + { + "epoch": 0.53, + "grad_norm": 0.25, + "learning_rate": 0.00010531526800852709, + "loss": 0.9576, + "step": 9320 + }, + { + "epoch": 0.54, + "grad_norm": 0.2451171875, + "learning_rate": 0.00010521526052433282, + "loss": 0.933, + "step": 9325 + }, + { + "epoch": 0.54, + "grad_norm": 0.255859375, + "learning_rate": 0.00010511524780959667, + "loss": 0.9386, + "step": 9330 + }, + { + "epoch": 0.54, + "grad_norm": 0.291015625, + "learning_rate": 0.0001050152299646245, + "loss": 0.95, + "step": 9335 + }, + { + "epoch": 0.54, + "grad_norm": 0.255859375, + "learning_rate": 0.00010491520708972716, + "loss": 0.9805, + "step": 9340 + }, + { + "epoch": 0.54, + "grad_norm": 0.263671875, + "learning_rate": 0.0001048151792852206, + "loss": 0.8701, + "step": 9345 + }, + { + "epoch": 0.54, + "grad_norm": 0.27734375, + "learning_rate": 0.00010471514665142572, + "loss": 1.011, + "step": 9350 + }, + { + "epoch": 0.54, + "grad_norm": 0.259765625, + "learning_rate": 0.00010461510928866828, + "loss": 0.9755, + "step": 9355 + }, + { + "epoch": 0.54, + "grad_norm": 0.26953125, + "learning_rate": 0.00010451506729727875, + "loss": 0.9517, + "step": 9360 + }, + { + "epoch": 0.54, + "grad_norm": 0.255859375, + "learning_rate": 0.00010441502077759229, + "loss": 0.9555, + "step": 9365 + }, + { + "epoch": 0.54, + "grad_norm": 0.251953125, + "learning_rate": 0.00010431496982994848, + "loss": 0.9624, + "step": 9370 + }, + { + "epoch": 0.54, + "grad_norm": 0.302734375, + "learning_rate": 0.00010421491455469153, + "loss": 0.9754, + "step": 9375 + }, + { + "epoch": 0.54, + "grad_norm": 0.279296875, + "learning_rate": 0.00010411485505216984, + "loss": 1.0126, + "step": 9380 + }, + { + "epoch": 0.54, + "grad_norm": 0.275390625, + "learning_rate": 0.00010401479142273611, + "loss": 0.9281, + "step": 9385 + }, + { + "epoch": 0.54, + "grad_norm": 0.259765625, + "learning_rate": 0.00010391472376674716, + "loss": 0.9074, + "step": 9390 + }, + { + "epoch": 0.54, + "grad_norm": 0.26171875, + "learning_rate": 0.00010381465218456383, + "loss": 0.9708, + "step": 9395 + }, + { + "epoch": 0.54, + "grad_norm": 0.275390625, + "learning_rate": 0.00010371457677655096, + "loss": 0.9924, + "step": 9400 + }, + { + "epoch": 0.54, + "grad_norm": 0.24609375, + "learning_rate": 0.0001036144976430772, + "loss": 0.9766, + "step": 9405 + }, + { + "epoch": 0.54, + "grad_norm": 0.267578125, + "learning_rate": 0.00010351441488451486, + "loss": 0.8865, + "step": 9410 + }, + { + "epoch": 0.54, + "grad_norm": 0.267578125, + "learning_rate": 0.00010341432860124003, + "loss": 0.939, + "step": 9415 + }, + { + "epoch": 0.54, + "grad_norm": 0.25390625, + "learning_rate": 0.00010331423889363223, + "loss": 0.9585, + "step": 9420 + }, + { + "epoch": 0.54, + "grad_norm": 0.27734375, + "learning_rate": 0.00010321414586207443, + "loss": 0.9802, + "step": 9425 + }, + { + "epoch": 0.54, + "grad_norm": 0.255859375, + "learning_rate": 0.00010311404960695299, + "loss": 0.9173, + "step": 9430 + }, + { + "epoch": 0.54, + "grad_norm": 0.267578125, + "learning_rate": 0.00010301395022865738, + "loss": 0.96, + "step": 9435 + }, + { + "epoch": 0.54, + "grad_norm": 0.267578125, + "learning_rate": 0.00010291384782758034, + "loss": 0.9746, + "step": 9440 + }, + { + "epoch": 0.54, + "grad_norm": 0.265625, + "learning_rate": 0.00010281374250411755, + "loss": 0.9577, + "step": 9445 + }, + { + "epoch": 0.54, + "grad_norm": 0.24609375, + "learning_rate": 0.00010271363435866765, + "loss": 0.9737, + "step": 9450 + }, + { + "epoch": 0.54, + "grad_norm": 0.267578125, + "learning_rate": 0.00010261352349163218, + "loss": 0.9668, + "step": 9455 + }, + { + "epoch": 0.54, + "grad_norm": 0.263671875, + "learning_rate": 0.00010251341000341528, + "loss": 0.9797, + "step": 9460 + }, + { + "epoch": 0.54, + "grad_norm": 0.267578125, + "learning_rate": 0.00010241329399442379, + "loss": 0.9331, + "step": 9465 + }, + { + "epoch": 0.54, + "grad_norm": 0.2578125, + "learning_rate": 0.00010231317556506708, + "loss": 1.0343, + "step": 9470 + }, + { + "epoch": 0.54, + "grad_norm": 0.255859375, + "learning_rate": 0.00010221305481575696, + "loss": 0.985, + "step": 9475 + }, + { + "epoch": 0.54, + "grad_norm": 0.28515625, + "learning_rate": 0.00010211293184690751, + "loss": 0.9484, + "step": 9480 + }, + { + "epoch": 0.54, + "grad_norm": 0.251953125, + "learning_rate": 0.00010201280675893507, + "loss": 0.9083, + "step": 9485 + }, + { + "epoch": 0.54, + "grad_norm": 0.291015625, + "learning_rate": 0.00010191267965225811, + "loss": 0.9323, + "step": 9490 + }, + { + "epoch": 0.54, + "grad_norm": 0.259765625, + "learning_rate": 0.00010181255062729713, + "loss": 0.8674, + "step": 9495 + }, + { + "epoch": 0.55, + "grad_norm": 0.294921875, + "learning_rate": 0.00010171241978447455, + "loss": 0.9209, + "step": 9500 + }, + { + "epoch": 0.55, + "grad_norm": 0.259765625, + "learning_rate": 0.0001016122872242146, + "loss": 0.9076, + "step": 9505 + }, + { + "epoch": 0.55, + "grad_norm": 0.263671875, + "learning_rate": 0.00010151215304694324, + "loss": 0.92, + "step": 9510 + }, + { + "epoch": 0.55, + "grad_norm": 0.2470703125, + "learning_rate": 0.00010141201735308805, + "loss": 0.9009, + "step": 9515 + }, + { + "epoch": 0.55, + "grad_norm": 0.259765625, + "learning_rate": 0.00010131188024307817, + "loss": 0.8967, + "step": 9520 + }, + { + "epoch": 0.55, + "grad_norm": 0.279296875, + "learning_rate": 0.00010121174181734405, + "loss": 0.959, + "step": 9525 + }, + { + "epoch": 0.55, + "grad_norm": 0.28125, + "learning_rate": 0.00010111160217631756, + "loss": 0.9591, + "step": 9530 + }, + { + "epoch": 0.55, + "grad_norm": 0.267578125, + "learning_rate": 0.00010101146142043178, + "loss": 0.9943, + "step": 9535 + }, + { + "epoch": 0.55, + "grad_norm": 0.31640625, + "learning_rate": 0.0001009113196501209, + "loss": 0.893, + "step": 9540 + }, + { + "epoch": 0.55, + "grad_norm": 0.25, + "learning_rate": 0.0001008111769658201, + "loss": 0.8963, + "step": 9545 + }, + { + "epoch": 0.55, + "grad_norm": 0.248046875, + "learning_rate": 0.00010071103346796549, + "loss": 0.902, + "step": 9550 + }, + { + "epoch": 0.55, + "grad_norm": 0.25390625, + "learning_rate": 0.000100610889256994, + "loss": 0.9108, + "step": 9555 + }, + { + "epoch": 0.55, + "grad_norm": 0.2470703125, + "learning_rate": 0.00010051074443334327, + "loss": 0.9926, + "step": 9560 + }, + { + "epoch": 0.55, + "grad_norm": 0.25390625, + "learning_rate": 0.00010041059909745156, + "loss": 0.9072, + "step": 9565 + }, + { + "epoch": 0.55, + "grad_norm": 0.259765625, + "learning_rate": 0.00010031045334975768, + "loss": 0.9536, + "step": 9570 + }, + { + "epoch": 0.55, + "grad_norm": 0.255859375, + "learning_rate": 0.00010021030729070076, + "loss": 0.9899, + "step": 9575 + }, + { + "epoch": 0.55, + "grad_norm": 0.26953125, + "learning_rate": 0.00010011016102072033, + "loss": 0.904, + "step": 9580 + }, + { + "epoch": 0.55, + "grad_norm": 0.263671875, + "learning_rate": 0.0001000100146402561, + "loss": 0.9465, + "step": 9585 + }, + { + "epoch": 0.55, + "grad_norm": 0.25, + "learning_rate": 9.990986824974788e-05, + "loss": 0.862, + "step": 9590 + }, + { + "epoch": 0.55, + "grad_norm": 0.265625, + "learning_rate": 9.980972194963552e-05, + "loss": 0.9563, + "step": 9595 + }, + { + "epoch": 0.55, + "grad_norm": 0.263671875, + "learning_rate": 9.970957584035873e-05, + "loss": 0.9042, + "step": 9600 + }, + { + "epoch": 0.55, + "grad_norm": 0.2734375, + "learning_rate": 9.96094300223571e-05, + "loss": 1.0266, + "step": 9605 + }, + { + "epoch": 0.55, + "grad_norm": 0.30078125, + "learning_rate": 9.950928459606984e-05, + "loss": 0.9222, + "step": 9610 + }, + { + "epoch": 0.55, + "grad_norm": 0.251953125, + "learning_rate": 9.940913966193586e-05, + "loss": 0.8897, + "step": 9615 + }, + { + "epoch": 0.55, + "grad_norm": 0.296875, + "learning_rate": 9.930899532039347e-05, + "loss": 1.022, + "step": 9620 + }, + { + "epoch": 0.55, + "grad_norm": 0.265625, + "learning_rate": 9.920885167188054e-05, + "loss": 0.9223, + "step": 9625 + }, + { + "epoch": 0.55, + "grad_norm": 0.2470703125, + "learning_rate": 9.910870881683402e-05, + "loss": 0.9112, + "step": 9630 + }, + { + "epoch": 0.55, + "grad_norm": 0.26171875, + "learning_rate": 9.900856685569027e-05, + "loss": 0.9659, + "step": 9635 + }, + { + "epoch": 0.55, + "grad_norm": 0.25, + "learning_rate": 9.890842588888474e-05, + "loss": 0.8959, + "step": 9640 + }, + { + "epoch": 0.55, + "grad_norm": 0.2890625, + "learning_rate": 9.88082860168517e-05, + "loss": 0.9726, + "step": 9645 + }, + { + "epoch": 0.55, + "grad_norm": 0.248046875, + "learning_rate": 9.870814734002456e-05, + "loss": 1.028, + "step": 9650 + }, + { + "epoch": 0.55, + "grad_norm": 0.26171875, + "learning_rate": 9.860800995883533e-05, + "loss": 0.9658, + "step": 9655 + }, + { + "epoch": 0.55, + "grad_norm": 0.291015625, + "learning_rate": 9.850787397371482e-05, + "loss": 0.9798, + "step": 9660 + }, + { + "epoch": 0.55, + "grad_norm": 0.255859375, + "learning_rate": 9.840773948509243e-05, + "loss": 0.9827, + "step": 9665 + }, + { + "epoch": 0.55, + "grad_norm": 0.267578125, + "learning_rate": 9.83076065933961e-05, + "loss": 0.914, + "step": 9670 + }, + { + "epoch": 0.56, + "grad_norm": 0.251953125, + "learning_rate": 9.820747539905202e-05, + "loss": 1.0198, + "step": 9675 + }, + { + "epoch": 0.56, + "grad_norm": 0.26171875, + "learning_rate": 9.810734600248486e-05, + "loss": 0.882, + "step": 9680 + }, + { + "epoch": 0.56, + "grad_norm": 0.2578125, + "learning_rate": 9.800721850411743e-05, + "loss": 0.921, + "step": 9685 + }, + { + "epoch": 0.56, + "grad_norm": 0.259765625, + "learning_rate": 9.790709300437052e-05, + "loss": 0.9465, + "step": 9690 + }, + { + "epoch": 0.56, + "grad_norm": 0.25390625, + "learning_rate": 9.780696960366311e-05, + "loss": 0.9131, + "step": 9695 + }, + { + "epoch": 0.56, + "grad_norm": 0.263671875, + "learning_rate": 9.770684840241191e-05, + "loss": 0.9612, + "step": 9700 + }, + { + "epoch": 0.56, + "grad_norm": 0.255859375, + "learning_rate": 9.76067295010315e-05, + "loss": 0.8921, + "step": 9705 + }, + { + "epoch": 0.56, + "grad_norm": 0.2470703125, + "learning_rate": 9.750661299993415e-05, + "loss": 0.8779, + "step": 9710 + }, + { + "epoch": 0.56, + "grad_norm": 0.259765625, + "learning_rate": 9.740649899952967e-05, + "loss": 0.9104, + "step": 9715 + }, + { + "epoch": 0.56, + "grad_norm": 0.248046875, + "learning_rate": 9.73063876002255e-05, + "loss": 0.9074, + "step": 9720 + }, + { + "epoch": 0.56, + "grad_norm": 0.26171875, + "learning_rate": 9.720627890242628e-05, + "loss": 0.8993, + "step": 9725 + }, + { + "epoch": 0.56, + "grad_norm": 0.267578125, + "learning_rate": 9.710617300653412e-05, + "loss": 1.0271, + "step": 9730 + }, + { + "epoch": 0.56, + "grad_norm": 0.271484375, + "learning_rate": 9.700607001294814e-05, + "loss": 0.8672, + "step": 9735 + }, + { + "epoch": 0.56, + "grad_norm": 0.2578125, + "learning_rate": 9.690597002206477e-05, + "loss": 0.9625, + "step": 9740 + }, + { + "epoch": 0.56, + "grad_norm": 0.265625, + "learning_rate": 9.68058731342772e-05, + "loss": 0.9321, + "step": 9745 + }, + { + "epoch": 0.56, + "grad_norm": 0.263671875, + "learning_rate": 9.670577944997566e-05, + "loss": 0.9249, + "step": 9750 + }, + { + "epoch": 0.56, + "grad_norm": 0.255859375, + "learning_rate": 9.660568906954711e-05, + "loss": 0.9381, + "step": 9755 + }, + { + "epoch": 0.56, + "grad_norm": 0.279296875, + "learning_rate": 9.65056020933752e-05, + "loss": 0.948, + "step": 9760 + }, + { + "epoch": 0.56, + "grad_norm": 0.275390625, + "learning_rate": 9.640551862184021e-05, + "loss": 0.97, + "step": 9765 + }, + { + "epoch": 0.56, + "grad_norm": 0.296875, + "learning_rate": 9.630543875531879e-05, + "loss": 0.9537, + "step": 9770 + }, + { + "epoch": 0.56, + "grad_norm": 0.265625, + "learning_rate": 9.620536259418416e-05, + "loss": 0.8846, + "step": 9775 + }, + { + "epoch": 0.56, + "grad_norm": 0.263671875, + "learning_rate": 9.610529023880561e-05, + "loss": 1.0599, + "step": 9780 + }, + { + "epoch": 0.56, + "grad_norm": 0.251953125, + "learning_rate": 9.600522178954879e-05, + "loss": 0.9412, + "step": 9785 + }, + { + "epoch": 0.56, + "grad_norm": 0.2451171875, + "learning_rate": 9.590515734677531e-05, + "loss": 0.9605, + "step": 9790 + }, + { + "epoch": 0.56, + "grad_norm": 0.265625, + "learning_rate": 9.580509701084286e-05, + "loss": 0.9098, + "step": 9795 + }, + { + "epoch": 0.56, + "grad_norm": 0.267578125, + "learning_rate": 9.570504088210496e-05, + "loss": 0.9473, + "step": 9800 + }, + { + "epoch": 0.56, + "grad_norm": 0.25, + "learning_rate": 9.560498906091085e-05, + "loss": 0.9205, + "step": 9805 + }, + { + "epoch": 0.56, + "grad_norm": 0.26953125, + "learning_rate": 9.550494164760562e-05, + "loss": 0.9717, + "step": 9810 + }, + { + "epoch": 0.56, + "grad_norm": 0.259765625, + "learning_rate": 9.540489874252972e-05, + "loss": 0.9545, + "step": 9815 + }, + { + "epoch": 0.56, + "grad_norm": 0.263671875, + "learning_rate": 9.53048604460193e-05, + "loss": 0.988, + "step": 9820 + }, + { + "epoch": 0.56, + "grad_norm": 0.244140625, + "learning_rate": 9.52048268584057e-05, + "loss": 0.8669, + "step": 9825 + }, + { + "epoch": 0.56, + "grad_norm": 0.255859375, + "learning_rate": 9.510479808001566e-05, + "loss": 0.9643, + "step": 9830 + }, + { + "epoch": 0.56, + "grad_norm": 0.2578125, + "learning_rate": 9.500477421117102e-05, + "loss": 0.9156, + "step": 9835 + }, + { + "epoch": 0.56, + "grad_norm": 0.2451171875, + "learning_rate": 9.490475535218875e-05, + "loss": 0.9286, + "step": 9840 + }, + { + "epoch": 0.56, + "grad_norm": 0.271484375, + "learning_rate": 9.480474160338082e-05, + "loss": 0.9137, + "step": 9845 + }, + { + "epoch": 0.57, + "grad_norm": 0.322265625, + "learning_rate": 9.470473306505392e-05, + "loss": 0.8888, + "step": 9850 + }, + { + "epoch": 0.57, + "grad_norm": 0.259765625, + "learning_rate": 9.460472983750977e-05, + "loss": 0.9228, + "step": 9855 + }, + { + "epoch": 0.57, + "grad_norm": 0.279296875, + "learning_rate": 9.450473202104448e-05, + "loss": 0.9923, + "step": 9860 + }, + { + "epoch": 0.57, + "grad_norm": 0.279296875, + "learning_rate": 9.440473971594895e-05, + "loss": 0.9869, + "step": 9865 + }, + { + "epoch": 0.57, + "grad_norm": 0.251953125, + "learning_rate": 9.430475302250844e-05, + "loss": 0.9735, + "step": 9870 + }, + { + "epoch": 0.57, + "grad_norm": 0.2734375, + "learning_rate": 9.420477204100264e-05, + "loss": 0.9472, + "step": 9875 + }, + { + "epoch": 0.57, + "grad_norm": 0.259765625, + "learning_rate": 9.41047968717055e-05, + "loss": 0.9805, + "step": 9880 + }, + { + "epoch": 0.57, + "grad_norm": 0.2470703125, + "learning_rate": 9.400482761488507e-05, + "loss": 0.9235, + "step": 9885 + }, + { + "epoch": 0.57, + "grad_norm": 0.251953125, + "learning_rate": 9.390486437080361e-05, + "loss": 0.8878, + "step": 9890 + }, + { + "epoch": 0.57, + "grad_norm": 0.267578125, + "learning_rate": 9.380490723971717e-05, + "loss": 0.9191, + "step": 9895 + }, + { + "epoch": 0.57, + "grad_norm": 0.279296875, + "learning_rate": 9.370495632187587e-05, + "loss": 0.9409, + "step": 9900 + }, + { + "epoch": 0.57, + "grad_norm": 0.2734375, + "learning_rate": 9.360501171752339e-05, + "loss": 0.9563, + "step": 9905 + }, + { + "epoch": 0.57, + "grad_norm": 0.25390625, + "learning_rate": 9.35050735268973e-05, + "loss": 0.9377, + "step": 9910 + }, + { + "epoch": 0.57, + "grad_norm": 0.26171875, + "learning_rate": 9.340514185022851e-05, + "loss": 0.9467, + "step": 9915 + }, + { + "epoch": 0.57, + "grad_norm": 0.2578125, + "learning_rate": 9.330521678774157e-05, + "loss": 0.9456, + "step": 9920 + }, + { + "epoch": 0.57, + "grad_norm": 0.2470703125, + "learning_rate": 9.320529843965432e-05, + "loss": 0.904, + "step": 9925 + }, + { + "epoch": 0.57, + "grad_norm": 0.255859375, + "learning_rate": 9.310538690617788e-05, + "loss": 0.9357, + "step": 9930 + }, + { + "epoch": 0.57, + "grad_norm": 0.2451171875, + "learning_rate": 9.300548228751657e-05, + "loss": 0.9386, + "step": 9935 + }, + { + "epoch": 0.57, + "grad_norm": 0.2578125, + "learning_rate": 9.290558468386765e-05, + "loss": 0.9631, + "step": 9940 + }, + { + "epoch": 0.57, + "grad_norm": 0.31640625, + "learning_rate": 9.280569419542154e-05, + "loss": 1.0054, + "step": 9945 + }, + { + "epoch": 0.57, + "grad_norm": 0.271484375, + "learning_rate": 9.270581092236134e-05, + "loss": 0.9002, + "step": 9950 + }, + { + "epoch": 0.57, + "grad_norm": 0.263671875, + "learning_rate": 9.260593496486302e-05, + "loss": 0.9869, + "step": 9955 + }, + { + "epoch": 0.57, + "grad_norm": 0.267578125, + "learning_rate": 9.250606642309523e-05, + "loss": 0.9615, + "step": 9960 + }, + { + "epoch": 0.57, + "grad_norm": 0.255859375, + "learning_rate": 9.240620539721904e-05, + "loss": 0.9069, + "step": 9965 + }, + { + "epoch": 0.57, + "grad_norm": 0.255859375, + "learning_rate": 9.23063519873882e-05, + "loss": 0.9112, + "step": 9970 + }, + { + "epoch": 0.57, + "grad_norm": 0.263671875, + "learning_rate": 9.220650629374862e-05, + "loss": 0.9132, + "step": 9975 + }, + { + "epoch": 0.57, + "grad_norm": 0.263671875, + "learning_rate": 9.210666841643857e-05, + "loss": 0.9454, + "step": 9980 + }, + { + "epoch": 0.57, + "grad_norm": 0.244140625, + "learning_rate": 9.200683845558845e-05, + "loss": 0.9629, + "step": 9985 + }, + { + "epoch": 0.57, + "grad_norm": 0.26953125, + "learning_rate": 9.190701651132079e-05, + "loss": 0.9644, + "step": 9990 + }, + { + "epoch": 0.57, + "grad_norm": 0.255859375, + "learning_rate": 9.180720268374992e-05, + "loss": 0.9684, + "step": 9995 + }, + { + "epoch": 0.57, + "grad_norm": 0.24609375, + "learning_rate": 9.170739707298221e-05, + "loss": 0.9239, + "step": 10000 + }, + { + "epoch": 0.57, + "grad_norm": 0.28125, + "learning_rate": 9.160759977911576e-05, + "loss": 0.8766, + "step": 10005 + }, + { + "epoch": 0.57, + "grad_norm": 0.251953125, + "learning_rate": 9.150781090224015e-05, + "loss": 0.973, + "step": 10010 + }, + { + "epoch": 0.57, + "grad_norm": 0.267578125, + "learning_rate": 9.140803054243677e-05, + "loss": 0.9588, + "step": 10015 + }, + { + "epoch": 0.57, + "grad_norm": 0.271484375, + "learning_rate": 9.130825879977828e-05, + "loss": 0.9172, + "step": 10020 + }, + { + "epoch": 0.58, + "grad_norm": 0.263671875, + "learning_rate": 9.12084957743288e-05, + "loss": 0.9477, + "step": 10025 + }, + { + "epoch": 0.58, + "grad_norm": 0.251953125, + "learning_rate": 9.110874156614362e-05, + "loss": 0.9445, + "step": 10030 + }, + { + "epoch": 0.58, + "grad_norm": 0.302734375, + "learning_rate": 9.100899627526933e-05, + "loss": 0.9084, + "step": 10035 + }, + { + "epoch": 0.58, + "grad_norm": 0.25, + "learning_rate": 9.090926000174338e-05, + "loss": 0.8938, + "step": 10040 + }, + { + "epoch": 0.58, + "grad_norm": 0.2431640625, + "learning_rate": 9.080953284559433e-05, + "loss": 0.9041, + "step": 10045 + }, + { + "epoch": 0.58, + "grad_norm": 0.267578125, + "learning_rate": 9.070981490684159e-05, + "loss": 0.9502, + "step": 10050 + }, + { + "epoch": 0.58, + "grad_norm": 0.3046875, + "learning_rate": 9.061010628549522e-05, + "loss": 0.9194, + "step": 10055 + }, + { + "epoch": 0.58, + "grad_norm": 0.240234375, + "learning_rate": 9.051040708155606e-05, + "loss": 0.9056, + "step": 10060 + }, + { + "epoch": 0.58, + "grad_norm": 0.29296875, + "learning_rate": 9.041071739501538e-05, + "loss": 0.9462, + "step": 10065 + }, + { + "epoch": 0.58, + "grad_norm": 0.251953125, + "learning_rate": 9.0311037325855e-05, + "loss": 0.8849, + "step": 10070 + }, + { + "epoch": 0.58, + "grad_norm": 0.263671875, + "learning_rate": 9.021136697404706e-05, + "loss": 0.9928, + "step": 10075 + }, + { + "epoch": 0.58, + "grad_norm": 0.265625, + "learning_rate": 9.011170643955394e-05, + "loss": 0.91, + "step": 10080 + }, + { + "epoch": 0.58, + "grad_norm": 0.255859375, + "learning_rate": 9.001205582232825e-05, + "loss": 0.913, + "step": 10085 + }, + { + "epoch": 0.58, + "grad_norm": 0.251953125, + "learning_rate": 8.991241522231252e-05, + "loss": 0.9421, + "step": 10090 + }, + { + "epoch": 0.58, + "grad_norm": 0.2421875, + "learning_rate": 8.981278473943936e-05, + "loss": 0.9712, + "step": 10095 + }, + { + "epoch": 0.58, + "grad_norm": 0.271484375, + "learning_rate": 8.971316447363115e-05, + "loss": 0.9144, + "step": 10100 + }, + { + "epoch": 0.58, + "grad_norm": 0.2578125, + "learning_rate": 8.96135545248001e-05, + "loss": 0.9127, + "step": 10105 + }, + { + "epoch": 0.58, + "grad_norm": 0.267578125, + "learning_rate": 8.951395499284797e-05, + "loss": 0.9771, + "step": 10110 + }, + { + "epoch": 0.58, + "grad_norm": 0.263671875, + "learning_rate": 8.941436597766616e-05, + "loss": 0.9841, + "step": 10115 + }, + { + "epoch": 0.58, + "grad_norm": 0.2451171875, + "learning_rate": 8.93147875791355e-05, + "loss": 0.9138, + "step": 10120 + }, + { + "epoch": 0.58, + "grad_norm": 0.23828125, + "learning_rate": 8.921521989712611e-05, + "loss": 0.9643, + "step": 10125 + }, + { + "epoch": 0.58, + "grad_norm": 0.2353515625, + "learning_rate": 8.91156630314975e-05, + "loss": 0.9428, + "step": 10130 + }, + { + "epoch": 0.58, + "grad_norm": 0.267578125, + "learning_rate": 8.901611708209818e-05, + "loss": 0.9558, + "step": 10135 + }, + { + "epoch": 0.58, + "grad_norm": 0.2470703125, + "learning_rate": 8.891658214876585e-05, + "loss": 0.9425, + "step": 10140 + }, + { + "epoch": 0.58, + "grad_norm": 0.23828125, + "learning_rate": 8.8817058331327e-05, + "loss": 0.9372, + "step": 10145 + }, + { + "epoch": 0.58, + "grad_norm": 0.271484375, + "learning_rate": 8.871754572959716e-05, + "loss": 0.9263, + "step": 10150 + }, + { + "epoch": 0.58, + "grad_norm": 0.2470703125, + "learning_rate": 8.861804444338045e-05, + "loss": 0.9587, + "step": 10155 + }, + { + "epoch": 0.58, + "grad_norm": 0.2490234375, + "learning_rate": 8.851855457246972e-05, + "loss": 0.9165, + "step": 10160 + }, + { + "epoch": 0.58, + "grad_norm": 0.26953125, + "learning_rate": 8.841907621664638e-05, + "loss": 0.9305, + "step": 10165 + }, + { + "epoch": 0.58, + "grad_norm": 0.2578125, + "learning_rate": 8.83196094756802e-05, + "loss": 0.8434, + "step": 10170 + }, + { + "epoch": 0.58, + "grad_norm": 0.27734375, + "learning_rate": 8.82201544493295e-05, + "loss": 0.9661, + "step": 10175 + }, + { + "epoch": 0.58, + "grad_norm": 0.244140625, + "learning_rate": 8.812071123734058e-05, + "loss": 0.9175, + "step": 10180 + }, + { + "epoch": 0.58, + "grad_norm": 0.263671875, + "learning_rate": 8.802127993944814e-05, + "loss": 0.9114, + "step": 10185 + }, + { + "epoch": 0.58, + "grad_norm": 0.2451171875, + "learning_rate": 8.792186065537473e-05, + "loss": 0.9737, + "step": 10190 + }, + { + "epoch": 0.58, + "grad_norm": 0.255859375, + "learning_rate": 8.782245348483104e-05, + "loss": 0.992, + "step": 10195 + }, + { + "epoch": 0.59, + "grad_norm": 0.2392578125, + "learning_rate": 8.772305852751542e-05, + "loss": 0.9458, + "step": 10200 + }, + { + "epoch": 0.59, + "grad_norm": 0.294921875, + "learning_rate": 8.762367588311414e-05, + "loss": 0.8808, + "step": 10205 + }, + { + "epoch": 0.59, + "grad_norm": 0.263671875, + "learning_rate": 8.752430565130103e-05, + "loss": 0.9249, + "step": 10210 + }, + { + "epoch": 0.59, + "grad_norm": 0.265625, + "learning_rate": 8.742494793173743e-05, + "loss": 0.9526, + "step": 10215 + }, + { + "epoch": 0.59, + "grad_norm": 0.25390625, + "learning_rate": 8.73256028240723e-05, + "loss": 1.0145, + "step": 10220 + }, + { + "epoch": 0.59, + "grad_norm": 0.265625, + "learning_rate": 8.722627042794171e-05, + "loss": 0.9798, + "step": 10225 + }, + { + "epoch": 0.59, + "grad_norm": 0.255859375, + "learning_rate": 8.712695084296924e-05, + "loss": 0.8368, + "step": 10230 + }, + { + "epoch": 0.59, + "grad_norm": 0.26953125, + "learning_rate": 8.702764416876537e-05, + "loss": 0.9953, + "step": 10235 + }, + { + "epoch": 0.59, + "grad_norm": 0.255859375, + "learning_rate": 8.692835050492785e-05, + "loss": 0.9289, + "step": 10240 + }, + { + "epoch": 0.59, + "grad_norm": 0.2490234375, + "learning_rate": 8.682906995104125e-05, + "loss": 0.8819, + "step": 10245 + }, + { + "epoch": 0.59, + "grad_norm": 0.2890625, + "learning_rate": 8.672980260667702e-05, + "loss": 0.9383, + "step": 10250 + }, + { + "epoch": 0.59, + "grad_norm": 0.25, + "learning_rate": 8.663054857139339e-05, + "loss": 0.9766, + "step": 10255 + }, + { + "epoch": 0.59, + "grad_norm": 0.271484375, + "learning_rate": 8.653130794473517e-05, + "loss": 0.9114, + "step": 10260 + }, + { + "epoch": 0.59, + "grad_norm": 0.2451171875, + "learning_rate": 8.643208082623386e-05, + "loss": 0.8984, + "step": 10265 + }, + { + "epoch": 0.59, + "grad_norm": 0.2734375, + "learning_rate": 8.633286731540722e-05, + "loss": 0.9411, + "step": 10270 + }, + { + "epoch": 0.59, + "grad_norm": 0.259765625, + "learning_rate": 8.623366751175958e-05, + "loss": 0.9607, + "step": 10275 + }, + { + "epoch": 0.59, + "grad_norm": 0.2490234375, + "learning_rate": 8.613448151478131e-05, + "loss": 0.9819, + "step": 10280 + }, + { + "epoch": 0.59, + "grad_norm": 0.251953125, + "learning_rate": 8.603530942394908e-05, + "loss": 0.9658, + "step": 10285 + }, + { + "epoch": 0.59, + "grad_norm": 0.26953125, + "learning_rate": 8.593615133872558e-05, + "loss": 0.9323, + "step": 10290 + }, + { + "epoch": 0.59, + "grad_norm": 0.26171875, + "learning_rate": 8.583700735855941e-05, + "loss": 0.9315, + "step": 10295 + }, + { + "epoch": 0.59, + "grad_norm": 0.294921875, + "learning_rate": 8.57378775828851e-05, + "loss": 0.9985, + "step": 10300 + }, + { + "epoch": 0.59, + "grad_norm": 0.255859375, + "learning_rate": 8.563876211112282e-05, + "loss": 0.9708, + "step": 10305 + }, + { + "epoch": 0.59, + "grad_norm": 0.251953125, + "learning_rate": 8.553966104267852e-05, + "loss": 0.9249, + "step": 10310 + }, + { + "epoch": 0.59, + "grad_norm": 0.259765625, + "learning_rate": 8.544057447694358e-05, + "loss": 0.9349, + "step": 10315 + }, + { + "epoch": 0.59, + "grad_norm": 0.259765625, + "learning_rate": 8.534150251329494e-05, + "loss": 0.9589, + "step": 10320 + }, + { + "epoch": 0.59, + "grad_norm": 0.259765625, + "learning_rate": 8.52424452510949e-05, + "loss": 0.926, + "step": 10325 + }, + { + "epoch": 0.59, + "grad_norm": 0.248046875, + "learning_rate": 8.514340278969089e-05, + "loss": 0.87, + "step": 10330 + }, + { + "epoch": 0.59, + "grad_norm": 0.265625, + "learning_rate": 8.504437522841566e-05, + "loss": 0.966, + "step": 10335 + }, + { + "epoch": 0.59, + "grad_norm": 0.2470703125, + "learning_rate": 8.494536266658687e-05, + "loss": 0.9709, + "step": 10340 + }, + { + "epoch": 0.59, + "grad_norm": 0.255859375, + "learning_rate": 8.484636520350724e-05, + "loss": 1.087, + "step": 10345 + }, + { + "epoch": 0.59, + "grad_norm": 0.26953125, + "learning_rate": 8.474738293846424e-05, + "loss": 0.9734, + "step": 10350 + }, + { + "epoch": 0.59, + "grad_norm": 0.2578125, + "learning_rate": 8.464841597073024e-05, + "loss": 0.9621, + "step": 10355 + }, + { + "epoch": 0.59, + "grad_norm": 0.267578125, + "learning_rate": 8.454946439956213e-05, + "loss": 0.9551, + "step": 10360 + }, + { + "epoch": 0.59, + "grad_norm": 0.2734375, + "learning_rate": 8.445052832420146e-05, + "loss": 0.9701, + "step": 10365 + }, + { + "epoch": 0.6, + "grad_norm": 0.2578125, + "learning_rate": 8.435160784387423e-05, + "loss": 0.9783, + "step": 10370 + }, + { + "epoch": 0.6, + "grad_norm": 0.28125, + "learning_rate": 8.425270305779069e-05, + "loss": 0.9546, + "step": 10375 + }, + { + "epoch": 0.6, + "grad_norm": 0.2734375, + "learning_rate": 8.415381406514551e-05, + "loss": 0.9028, + "step": 10380 + }, + { + "epoch": 0.6, + "grad_norm": 0.24609375, + "learning_rate": 8.405494096511737e-05, + "loss": 0.9537, + "step": 10385 + }, + { + "epoch": 0.6, + "grad_norm": 0.26953125, + "learning_rate": 8.395608385686911e-05, + "loss": 0.8804, + "step": 10390 + }, + { + "epoch": 0.6, + "grad_norm": 0.25, + "learning_rate": 8.38572428395475e-05, + "loss": 0.9221, + "step": 10395 + }, + { + "epoch": 0.6, + "grad_norm": 0.265625, + "learning_rate": 8.37584180122832e-05, + "loss": 0.9675, + "step": 10400 + }, + { + "epoch": 0.6, + "grad_norm": 0.26953125, + "learning_rate": 8.365960947419054e-05, + "loss": 0.9677, + "step": 10405 + }, + { + "epoch": 0.6, + "grad_norm": 0.251953125, + "learning_rate": 8.356081732436759e-05, + "loss": 0.9545, + "step": 10410 + }, + { + "epoch": 0.6, + "grad_norm": 0.2373046875, + "learning_rate": 8.346204166189607e-05, + "loss": 0.8864, + "step": 10415 + }, + { + "epoch": 0.6, + "grad_norm": 0.2451171875, + "learning_rate": 8.336328258584093e-05, + "loss": 0.9734, + "step": 10420 + }, + { + "epoch": 0.6, + "grad_norm": 0.25, + "learning_rate": 8.326454019525072e-05, + "loss": 0.9281, + "step": 10425 + }, + { + "epoch": 0.6, + "grad_norm": 0.26171875, + "learning_rate": 8.316581458915711e-05, + "loss": 0.9169, + "step": 10430 + }, + { + "epoch": 0.6, + "grad_norm": 0.267578125, + "learning_rate": 8.3067105866575e-05, + "loss": 0.9249, + "step": 10435 + }, + { + "epoch": 0.6, + "grad_norm": 0.271484375, + "learning_rate": 8.296841412650233e-05, + "loss": 1.0116, + "step": 10440 + }, + { + "epoch": 0.6, + "grad_norm": 0.265625, + "learning_rate": 8.286973946792e-05, + "loss": 0.9494, + "step": 10445 + }, + { + "epoch": 0.6, + "grad_norm": 0.251953125, + "learning_rate": 8.277108198979188e-05, + "loss": 0.9606, + "step": 10450 + }, + { + "epoch": 0.6, + "grad_norm": 0.265625, + "learning_rate": 8.267244179106441e-05, + "loss": 0.9395, + "step": 10455 + }, + { + "epoch": 0.6, + "grad_norm": 0.2578125, + "learning_rate": 8.257381897066691e-05, + "loss": 0.9427, + "step": 10460 + }, + { + "epoch": 0.6, + "grad_norm": 0.265625, + "learning_rate": 8.24752136275111e-05, + "loss": 0.9139, + "step": 10465 + }, + { + "epoch": 0.6, + "grad_norm": 0.259765625, + "learning_rate": 8.237662586049133e-05, + "loss": 1.0039, + "step": 10470 + }, + { + "epoch": 0.6, + "grad_norm": 0.306640625, + "learning_rate": 8.227805576848418e-05, + "loss": 0.9437, + "step": 10475 + }, + { + "epoch": 0.6, + "grad_norm": 0.28125, + "learning_rate": 8.217950345034858e-05, + "loss": 0.9363, + "step": 10480 + }, + { + "epoch": 0.6, + "grad_norm": 0.263671875, + "learning_rate": 8.208096900492562e-05, + "loss": 0.9247, + "step": 10485 + }, + { + "epoch": 0.6, + "grad_norm": 0.251953125, + "learning_rate": 8.198245253103843e-05, + "loss": 0.9966, + "step": 10490 + }, + { + "epoch": 0.6, + "grad_norm": 0.2578125, + "learning_rate": 8.188395412749223e-05, + "loss": 1.0299, + "step": 10495 + }, + { + "epoch": 0.6, + "grad_norm": 0.255859375, + "learning_rate": 8.178547389307393e-05, + "loss": 0.9266, + "step": 10500 + }, + { + "epoch": 0.6, + "grad_norm": 0.26953125, + "learning_rate": 8.168701192655243e-05, + "loss": 1.0203, + "step": 10505 + }, + { + "epoch": 0.6, + "grad_norm": 0.283203125, + "learning_rate": 8.158856832667811e-05, + "loss": 0.9132, + "step": 10510 + }, + { + "epoch": 0.6, + "grad_norm": 0.251953125, + "learning_rate": 8.14901431921831e-05, + "loss": 0.9272, + "step": 10515 + }, + { + "epoch": 0.6, + "grad_norm": 0.2578125, + "learning_rate": 8.139173662178086e-05, + "loss": 0.9588, + "step": 10520 + }, + { + "epoch": 0.6, + "grad_norm": 0.271484375, + "learning_rate": 8.129334871416632e-05, + "loss": 0.9455, + "step": 10525 + }, + { + "epoch": 0.6, + "grad_norm": 0.25, + "learning_rate": 8.119497956801571e-05, + "loss": 0.9381, + "step": 10530 + }, + { + "epoch": 0.6, + "grad_norm": 0.26953125, + "learning_rate": 8.109662928198638e-05, + "loss": 1.0069, + "step": 10535 + }, + { + "epoch": 0.6, + "grad_norm": 0.287109375, + "learning_rate": 8.099829795471683e-05, + "loss": 0.9718, + "step": 10540 + }, + { + "epoch": 0.61, + "grad_norm": 0.26171875, + "learning_rate": 8.089998568482643e-05, + "loss": 0.9662, + "step": 10545 + }, + { + "epoch": 0.61, + "grad_norm": 0.2578125, + "learning_rate": 8.080169257091562e-05, + "loss": 0.9271, + "step": 10550 + }, + { + "epoch": 0.61, + "grad_norm": 0.2734375, + "learning_rate": 8.070341871156541e-05, + "loss": 1.0115, + "step": 10555 + }, + { + "epoch": 0.61, + "grad_norm": 0.255859375, + "learning_rate": 8.060516420533774e-05, + "loss": 0.938, + "step": 10560 + }, + { + "epoch": 0.61, + "grad_norm": 0.251953125, + "learning_rate": 8.050692915077489e-05, + "loss": 0.8801, + "step": 10565 + }, + { + "epoch": 0.61, + "grad_norm": 0.26171875, + "learning_rate": 8.040871364639983e-05, + "loss": 0.9114, + "step": 10570 + }, + { + "epoch": 0.61, + "grad_norm": 0.265625, + "learning_rate": 8.031051779071587e-05, + "loss": 0.9507, + "step": 10575 + }, + { + "epoch": 0.61, + "grad_norm": 0.2578125, + "learning_rate": 8.021234168220649e-05, + "loss": 0.9484, + "step": 10580 + }, + { + "epoch": 0.61, + "grad_norm": 0.26953125, + "learning_rate": 8.011418541933558e-05, + "loss": 0.9998, + "step": 10585 + }, + { + "epoch": 0.61, + "grad_norm": 0.259765625, + "learning_rate": 8.00160491005469e-05, + "loss": 0.9445, + "step": 10590 + }, + { + "epoch": 0.61, + "grad_norm": 0.251953125, + "learning_rate": 7.991793282426442e-05, + "loss": 0.9108, + "step": 10595 + }, + { + "epoch": 0.61, + "grad_norm": 0.271484375, + "learning_rate": 7.981983668889182e-05, + "loss": 0.9833, + "step": 10600 + }, + { + "epoch": 0.61, + "grad_norm": 0.2734375, + "learning_rate": 7.972176079281275e-05, + "loss": 0.9454, + "step": 10605 + }, + { + "epoch": 0.61, + "grad_norm": 0.271484375, + "learning_rate": 7.962370523439044e-05, + "loss": 0.9443, + "step": 10610 + }, + { + "epoch": 0.61, + "grad_norm": 0.28515625, + "learning_rate": 7.952567011196774e-05, + "loss": 0.9786, + "step": 10615 + }, + { + "epoch": 0.61, + "grad_norm": 0.2734375, + "learning_rate": 7.942765552386709e-05, + "loss": 0.9386, + "step": 10620 + }, + { + "epoch": 0.61, + "grad_norm": 0.255859375, + "learning_rate": 7.932966156839018e-05, + "loss": 1.0092, + "step": 10625 + }, + { + "epoch": 0.61, + "grad_norm": 0.2451171875, + "learning_rate": 7.923168834381822e-05, + "loss": 0.8734, + "step": 10630 + }, + { + "epoch": 0.61, + "grad_norm": 0.267578125, + "learning_rate": 7.913373594841139e-05, + "loss": 0.9287, + "step": 10635 + }, + { + "epoch": 0.61, + "grad_norm": 0.2451171875, + "learning_rate": 7.903580448040917e-05, + "loss": 0.9007, + "step": 10640 + }, + { + "epoch": 0.61, + "grad_norm": 0.25, + "learning_rate": 7.893789403802992e-05, + "loss": 0.9592, + "step": 10645 + }, + { + "epoch": 0.61, + "grad_norm": 0.25390625, + "learning_rate": 7.884000471947104e-05, + "loss": 0.9578, + "step": 10650 + }, + { + "epoch": 0.61, + "grad_norm": 0.265625, + "learning_rate": 7.874213662290862e-05, + "loss": 0.9031, + "step": 10655 + }, + { + "epoch": 0.61, + "grad_norm": 0.26171875, + "learning_rate": 7.864428984649757e-05, + "loss": 0.9133, + "step": 10660 + }, + { + "epoch": 0.61, + "grad_norm": 0.251953125, + "learning_rate": 7.854646448837134e-05, + "loss": 0.9158, + "step": 10665 + }, + { + "epoch": 0.61, + "grad_norm": 0.25, + "learning_rate": 7.844866064664189e-05, + "loss": 0.9334, + "step": 10670 + }, + { + "epoch": 0.61, + "grad_norm": 0.255859375, + "learning_rate": 7.835087841939973e-05, + "loss": 0.9653, + "step": 10675 + }, + { + "epoch": 0.61, + "grad_norm": 0.26171875, + "learning_rate": 7.82531179047135e-05, + "loss": 0.9604, + "step": 10680 + }, + { + "epoch": 0.61, + "grad_norm": 0.2490234375, + "learning_rate": 7.815537920063019e-05, + "loss": 1.0108, + "step": 10685 + }, + { + "epoch": 0.61, + "grad_norm": 0.2451171875, + "learning_rate": 7.805766240517498e-05, + "loss": 0.9327, + "step": 10690 + }, + { + "epoch": 0.61, + "grad_norm": 0.267578125, + "learning_rate": 7.795996761635087e-05, + "loss": 0.9369, + "step": 10695 + }, + { + "epoch": 0.61, + "grad_norm": 0.267578125, + "learning_rate": 7.786229493213901e-05, + "loss": 0.9592, + "step": 10700 + }, + { + "epoch": 0.61, + "grad_norm": 0.2578125, + "learning_rate": 7.776464445049817e-05, + "loss": 0.8983, + "step": 10705 + }, + { + "epoch": 0.61, + "grad_norm": 0.263671875, + "learning_rate": 7.766701626936505e-05, + "loss": 0.9935, + "step": 10710 + }, + { + "epoch": 0.61, + "grad_norm": 0.275390625, + "learning_rate": 7.75694104866538e-05, + "loss": 0.9206, + "step": 10715 + }, + { + "epoch": 0.62, + "grad_norm": 0.2578125, + "learning_rate": 7.74718272002563e-05, + "loss": 0.9622, + "step": 10720 + }, + { + "epoch": 0.62, + "grad_norm": 0.267578125, + "learning_rate": 7.737426650804168e-05, + "loss": 0.9379, + "step": 10725 + }, + { + "epoch": 0.62, + "grad_norm": 0.263671875, + "learning_rate": 7.727672850785651e-05, + "loss": 0.9329, + "step": 10730 + }, + { + "epoch": 0.62, + "grad_norm": 0.244140625, + "learning_rate": 7.717921329752466e-05, + "loss": 1.0257, + "step": 10735 + }, + { + "epoch": 0.62, + "grad_norm": 0.259765625, + "learning_rate": 7.708172097484699e-05, + "loss": 0.8839, + "step": 10740 + }, + { + "epoch": 0.62, + "grad_norm": 0.26171875, + "learning_rate": 7.698425163760156e-05, + "loss": 0.9445, + "step": 10745 + }, + { + "epoch": 0.62, + "grad_norm": 0.26171875, + "learning_rate": 7.688680538354323e-05, + "loss": 0.9536, + "step": 10750 + }, + { + "epoch": 0.62, + "grad_norm": 0.255859375, + "learning_rate": 7.678938231040383e-05, + "loss": 0.8843, + "step": 10755 + }, + { + "epoch": 0.62, + "grad_norm": 0.24609375, + "learning_rate": 7.669198251589188e-05, + "loss": 0.9955, + "step": 10760 + }, + { + "epoch": 0.62, + "grad_norm": 0.240234375, + "learning_rate": 7.659460609769252e-05, + "loss": 1.0015, + "step": 10765 + }, + { + "epoch": 0.62, + "grad_norm": 0.255859375, + "learning_rate": 7.649725315346761e-05, + "loss": 0.9057, + "step": 10770 + }, + { + "epoch": 0.62, + "grad_norm": 0.2578125, + "learning_rate": 7.639992378085521e-05, + "loss": 0.9174, + "step": 10775 + }, + { + "epoch": 0.62, + "grad_norm": 0.25390625, + "learning_rate": 7.630261807747e-05, + "loss": 0.929, + "step": 10780 + }, + { + "epoch": 0.62, + "grad_norm": 0.2451171875, + "learning_rate": 7.620533614090269e-05, + "loss": 0.9768, + "step": 10785 + }, + { + "epoch": 0.62, + "grad_norm": 0.263671875, + "learning_rate": 7.610807806872038e-05, + "loss": 1.0064, + "step": 10790 + }, + { + "epoch": 0.62, + "grad_norm": 0.275390625, + "learning_rate": 7.601084395846603e-05, + "loss": 0.9434, + "step": 10795 + }, + { + "epoch": 0.62, + "grad_norm": 0.248046875, + "learning_rate": 7.591363390765868e-05, + "loss": 0.9428, + "step": 10800 + }, + { + "epoch": 0.62, + "grad_norm": 0.2490234375, + "learning_rate": 7.581644801379324e-05, + "loss": 0.9157, + "step": 10805 + }, + { + "epoch": 0.62, + "grad_norm": 0.265625, + "learning_rate": 7.571928637434031e-05, + "loss": 0.948, + "step": 10810 + }, + { + "epoch": 0.62, + "grad_norm": 0.26171875, + "learning_rate": 7.562214908674633e-05, + "loss": 1.0172, + "step": 10815 + }, + { + "epoch": 0.62, + "grad_norm": 0.26171875, + "learning_rate": 7.55250362484331e-05, + "loss": 0.9701, + "step": 10820 + }, + { + "epoch": 0.62, + "grad_norm": 0.251953125, + "learning_rate": 7.542794795679811e-05, + "loss": 0.9276, + "step": 10825 + }, + { + "epoch": 0.62, + "grad_norm": 0.26171875, + "learning_rate": 7.533088430921402e-05, + "loss": 0.9317, + "step": 10830 + }, + { + "epoch": 0.62, + "grad_norm": 0.279296875, + "learning_rate": 7.5233845403029e-05, + "loss": 0.9199, + "step": 10835 + }, + { + "epoch": 0.62, + "grad_norm": 0.2373046875, + "learning_rate": 7.51368313355662e-05, + "loss": 0.8962, + "step": 10840 + }, + { + "epoch": 0.62, + "grad_norm": 0.267578125, + "learning_rate": 7.5039842204124e-05, + "loss": 0.8965, + "step": 10845 + }, + { + "epoch": 0.62, + "grad_norm": 0.248046875, + "learning_rate": 7.49428781059757e-05, + "loss": 0.9054, + "step": 10850 + }, + { + "epoch": 0.62, + "grad_norm": 0.26171875, + "learning_rate": 7.484593913836951e-05, + "loss": 0.9142, + "step": 10855 + }, + { + "epoch": 0.62, + "grad_norm": 0.267578125, + "learning_rate": 7.474902539852848e-05, + "loss": 0.9123, + "step": 10860 + }, + { + "epoch": 0.62, + "grad_norm": 0.26171875, + "learning_rate": 7.465213698365026e-05, + "loss": 0.9701, + "step": 10865 + }, + { + "epoch": 0.62, + "grad_norm": 0.255859375, + "learning_rate": 7.455527399090721e-05, + "loss": 0.97, + "step": 10870 + }, + { + "epoch": 0.62, + "grad_norm": 0.251953125, + "learning_rate": 7.445843651744609e-05, + "loss": 0.9103, + "step": 10875 + }, + { + "epoch": 0.62, + "grad_norm": 0.26171875, + "learning_rate": 7.436162466038818e-05, + "loss": 0.9494, + "step": 10880 + }, + { + "epoch": 0.62, + "grad_norm": 0.251953125, + "learning_rate": 7.426483851682898e-05, + "loss": 0.9303, + "step": 10885 + }, + { + "epoch": 0.62, + "grad_norm": 0.251953125, + "learning_rate": 7.416807818383817e-05, + "loss": 0.9832, + "step": 10890 + }, + { + "epoch": 0.63, + "grad_norm": 0.2470703125, + "learning_rate": 7.407134375845972e-05, + "loss": 0.8896, + "step": 10895 + }, + { + "epoch": 0.63, + "grad_norm": 0.25, + "learning_rate": 7.397463533771139e-05, + "loss": 0.9505, + "step": 10900 + }, + { + "epoch": 0.63, + "grad_norm": 0.255859375, + "learning_rate": 7.387795301858504e-05, + "loss": 0.9017, + "step": 10905 + }, + { + "epoch": 0.63, + "grad_norm": 0.26171875, + "learning_rate": 7.378129689804623e-05, + "loss": 0.9594, + "step": 10910 + }, + { + "epoch": 0.63, + "grad_norm": 0.259765625, + "learning_rate": 7.368466707303434e-05, + "loss": 0.9913, + "step": 10915 + }, + { + "epoch": 0.63, + "grad_norm": 0.265625, + "learning_rate": 7.358806364046226e-05, + "loss": 1.0053, + "step": 10920 + }, + { + "epoch": 0.63, + "grad_norm": 0.279296875, + "learning_rate": 7.349148669721658e-05, + "loss": 0.9282, + "step": 10925 + }, + { + "epoch": 0.63, + "grad_norm": 0.25, + "learning_rate": 7.339493634015711e-05, + "loss": 0.9456, + "step": 10930 + }, + { + "epoch": 0.63, + "grad_norm": 0.244140625, + "learning_rate": 7.329841266611721e-05, + "loss": 0.9272, + "step": 10935 + }, + { + "epoch": 0.63, + "grad_norm": 0.283203125, + "learning_rate": 7.320191577190336e-05, + "loss": 0.9377, + "step": 10940 + }, + { + "epoch": 0.63, + "grad_norm": 0.25390625, + "learning_rate": 7.310544575429514e-05, + "loss": 0.8861, + "step": 10945 + }, + { + "epoch": 0.63, + "grad_norm": 0.259765625, + "learning_rate": 7.300900271004534e-05, + "loss": 0.9117, + "step": 10950 + }, + { + "epoch": 0.63, + "grad_norm": 0.296875, + "learning_rate": 7.291258673587947e-05, + "loss": 0.9802, + "step": 10955 + }, + { + "epoch": 0.63, + "grad_norm": 0.267578125, + "learning_rate": 7.281619792849612e-05, + "loss": 0.9195, + "step": 10960 + }, + { + "epoch": 0.63, + "grad_norm": 0.2578125, + "learning_rate": 7.271983638456644e-05, + "loss": 1.0314, + "step": 10965 + }, + { + "epoch": 0.63, + "grad_norm": 0.251953125, + "learning_rate": 7.26235022007344e-05, + "loss": 0.9812, + "step": 10970 + }, + { + "epoch": 0.63, + "grad_norm": 0.2578125, + "learning_rate": 7.252719547361641e-05, + "loss": 0.9425, + "step": 10975 + }, + { + "epoch": 0.63, + "grad_norm": 0.267578125, + "learning_rate": 7.243091629980141e-05, + "loss": 0.9824, + "step": 10980 + }, + { + "epoch": 0.63, + "grad_norm": 0.271484375, + "learning_rate": 7.233466477585068e-05, + "loss": 0.8939, + "step": 10985 + }, + { + "epoch": 0.63, + "grad_norm": 0.26171875, + "learning_rate": 7.223844099829773e-05, + "loss": 0.9015, + "step": 10990 + }, + { + "epoch": 0.63, + "grad_norm": 0.234375, + "learning_rate": 7.214224506364834e-05, + "loss": 0.8971, + "step": 10995 + }, + { + "epoch": 0.63, + "grad_norm": 0.283203125, + "learning_rate": 7.204607706838026e-05, + "loss": 0.9336, + "step": 11000 + }, + { + "epoch": 0.63, + "grad_norm": 0.275390625, + "learning_rate": 7.194993710894335e-05, + "loss": 1.0268, + "step": 11005 + }, + { + "epoch": 0.63, + "grad_norm": 0.259765625, + "learning_rate": 7.185382528175917e-05, + "loss": 0.8822, + "step": 11010 + }, + { + "epoch": 0.63, + "grad_norm": 0.26171875, + "learning_rate": 7.175774168322123e-05, + "loss": 1.0541, + "step": 11015 + }, + { + "epoch": 0.63, + "grad_norm": 0.25, + "learning_rate": 7.166168640969464e-05, + "loss": 0.9136, + "step": 11020 + }, + { + "epoch": 0.63, + "grad_norm": 0.255859375, + "learning_rate": 7.156565955751616e-05, + "loss": 0.9599, + "step": 11025 + }, + { + "epoch": 0.63, + "grad_norm": 0.25390625, + "learning_rate": 7.146966122299396e-05, + "loss": 0.9775, + "step": 11030 + }, + { + "epoch": 0.63, + "grad_norm": 0.265625, + "learning_rate": 7.137369150240769e-05, + "loss": 0.977, + "step": 11035 + }, + { + "epoch": 0.63, + "grad_norm": 0.28515625, + "learning_rate": 7.127775049200828e-05, + "loss": 0.9294, + "step": 11040 + }, + { + "epoch": 0.63, + "grad_norm": 0.2578125, + "learning_rate": 7.118183828801781e-05, + "loss": 0.9713, + "step": 11045 + }, + { + "epoch": 0.63, + "grad_norm": 0.2578125, + "learning_rate": 7.108595498662956e-05, + "loss": 0.924, + "step": 11050 + }, + { + "epoch": 0.63, + "grad_norm": 0.255859375, + "learning_rate": 7.099010068400781e-05, + "loss": 0.9111, + "step": 11055 + }, + { + "epoch": 0.63, + "grad_norm": 0.2392578125, + "learning_rate": 7.089427547628766e-05, + "loss": 0.9591, + "step": 11060 + }, + { + "epoch": 0.63, + "grad_norm": 0.24609375, + "learning_rate": 7.079847945957516e-05, + "loss": 0.8856, + "step": 11065 + }, + { + "epoch": 0.64, + "grad_norm": 0.27734375, + "learning_rate": 7.070271272994698e-05, + "loss": 0.9428, + "step": 11070 + }, + { + "epoch": 0.64, + "grad_norm": 0.248046875, + "learning_rate": 7.060697538345048e-05, + "loss": 0.9358, + "step": 11075 + }, + { + "epoch": 0.64, + "grad_norm": 0.251953125, + "learning_rate": 7.051126751610346e-05, + "loss": 0.9376, + "step": 11080 + }, + { + "epoch": 0.64, + "grad_norm": 0.26171875, + "learning_rate": 7.041558922389434e-05, + "loss": 0.9481, + "step": 11085 + }, + { + "epoch": 0.64, + "grad_norm": 0.263671875, + "learning_rate": 7.031994060278162e-05, + "loss": 0.9089, + "step": 11090 + }, + { + "epoch": 0.64, + "grad_norm": 0.26953125, + "learning_rate": 7.02243217486943e-05, + "loss": 0.9229, + "step": 11095 + }, + { + "epoch": 0.64, + "grad_norm": 0.248046875, + "learning_rate": 7.012873275753137e-05, + "loss": 0.8838, + "step": 11100 + }, + { + "epoch": 0.64, + "grad_norm": 0.271484375, + "learning_rate": 7.003317372516189e-05, + "loss": 0.9375, + "step": 11105 + }, + { + "epoch": 0.64, + "grad_norm": 0.25, + "learning_rate": 6.993764474742493e-05, + "loss": 0.8994, + "step": 11110 + }, + { + "epoch": 0.64, + "grad_norm": 0.2451171875, + "learning_rate": 6.984214592012935e-05, + "loss": 0.9055, + "step": 11115 + }, + { + "epoch": 0.64, + "grad_norm": 0.28515625, + "learning_rate": 6.974667733905377e-05, + "loss": 0.9884, + "step": 11120 + }, + { + "epoch": 0.64, + "grad_norm": 0.251953125, + "learning_rate": 6.965123909994658e-05, + "loss": 0.9248, + "step": 11125 + }, + { + "epoch": 0.64, + "grad_norm": 0.2412109375, + "learning_rate": 6.955583129852559e-05, + "loss": 0.9173, + "step": 11130 + }, + { + "epoch": 0.64, + "grad_norm": 0.244140625, + "learning_rate": 6.946045403047821e-05, + "loss": 0.9609, + "step": 11135 + }, + { + "epoch": 0.64, + "grad_norm": 0.279296875, + "learning_rate": 6.936510739146113e-05, + "loss": 0.9058, + "step": 11140 + }, + { + "epoch": 0.64, + "grad_norm": 0.28515625, + "learning_rate": 6.926979147710044e-05, + "loss": 0.9753, + "step": 11145 + }, + { + "epoch": 0.64, + "grad_norm": 0.25390625, + "learning_rate": 6.917450638299123e-05, + "loss": 0.9487, + "step": 11150 + }, + { + "epoch": 0.64, + "grad_norm": 0.255859375, + "learning_rate": 6.90792522046979e-05, + "loss": 0.9352, + "step": 11155 + }, + { + "epoch": 0.64, + "grad_norm": 0.259765625, + "learning_rate": 6.898402903775369e-05, + "loss": 0.9578, + "step": 11160 + }, + { + "epoch": 0.64, + "grad_norm": 0.279296875, + "learning_rate": 6.888883697766076e-05, + "loss": 0.9074, + "step": 11165 + }, + { + "epoch": 0.64, + "grad_norm": 0.275390625, + "learning_rate": 6.87936761198901e-05, + "loss": 1.0342, + "step": 11170 + }, + { + "epoch": 0.64, + "grad_norm": 0.259765625, + "learning_rate": 6.869854655988139e-05, + "loss": 0.9576, + "step": 11175 + }, + { + "epoch": 0.64, + "grad_norm": 0.283203125, + "learning_rate": 6.860344839304299e-05, + "loss": 0.9478, + "step": 11180 + }, + { + "epoch": 0.64, + "grad_norm": 0.267578125, + "learning_rate": 6.850838171475165e-05, + "loss": 0.9573, + "step": 11185 + }, + { + "epoch": 0.64, + "grad_norm": 0.251953125, + "learning_rate": 6.841334662035266e-05, + "loss": 0.963, + "step": 11190 + }, + { + "epoch": 0.64, + "grad_norm": 0.259765625, + "learning_rate": 6.83183432051595e-05, + "loss": 0.9702, + "step": 11195 + }, + { + "epoch": 0.64, + "grad_norm": 0.263671875, + "learning_rate": 6.822337156445406e-05, + "loss": 0.939, + "step": 11200 + }, + { + "epoch": 0.64, + "grad_norm": 0.263671875, + "learning_rate": 6.812843179348618e-05, + "loss": 0.9908, + "step": 11205 + }, + { + "epoch": 0.64, + "grad_norm": 0.271484375, + "learning_rate": 6.803352398747384e-05, + "loss": 0.9683, + "step": 11210 + }, + { + "epoch": 0.64, + "grad_norm": 0.2490234375, + "learning_rate": 6.793864824160295e-05, + "loss": 0.9921, + "step": 11215 + }, + { + "epoch": 0.64, + "grad_norm": 0.279296875, + "learning_rate": 6.78438046510272e-05, + "loss": 0.9828, + "step": 11220 + }, + { + "epoch": 0.64, + "grad_norm": 0.265625, + "learning_rate": 6.774899331086814e-05, + "loss": 1.0154, + "step": 11225 + }, + { + "epoch": 0.64, + "grad_norm": 0.267578125, + "learning_rate": 6.765421431621491e-05, + "loss": 0.9471, + "step": 11230 + }, + { + "epoch": 0.64, + "grad_norm": 0.2578125, + "learning_rate": 6.755946776212421e-05, + "loss": 0.9154, + "step": 11235 + }, + { + "epoch": 0.64, + "grad_norm": 0.244140625, + "learning_rate": 6.746475374362018e-05, + "loss": 0.8874, + "step": 11240 + }, + { + "epoch": 0.65, + "grad_norm": 0.265625, + "learning_rate": 6.737007235569442e-05, + "loss": 0.9187, + "step": 11245 + }, + { + "epoch": 0.65, + "grad_norm": 0.283203125, + "learning_rate": 6.727542369330571e-05, + "loss": 0.9596, + "step": 11250 + }, + { + "epoch": 0.65, + "grad_norm": 0.2333984375, + "learning_rate": 6.718080785138002e-05, + "loss": 0.9327, + "step": 11255 + }, + { + "epoch": 0.65, + "grad_norm": 0.259765625, + "learning_rate": 6.708622492481051e-05, + "loss": 0.9895, + "step": 11260 + }, + { + "epoch": 0.65, + "grad_norm": 0.251953125, + "learning_rate": 6.699167500845714e-05, + "loss": 0.9862, + "step": 11265 + }, + { + "epoch": 0.65, + "grad_norm": 0.248046875, + "learning_rate": 6.689715819714697e-05, + "loss": 0.9216, + "step": 11270 + }, + { + "epoch": 0.65, + "grad_norm": 0.283203125, + "learning_rate": 6.680267458567366e-05, + "loss": 0.9501, + "step": 11275 + }, + { + "epoch": 0.65, + "grad_norm": 0.248046875, + "learning_rate": 6.670822426879776e-05, + "loss": 0.916, + "step": 11280 + }, + { + "epoch": 0.65, + "grad_norm": 0.275390625, + "learning_rate": 6.661380734124625e-05, + "loss": 0.9404, + "step": 11285 + }, + { + "epoch": 0.65, + "grad_norm": 0.2470703125, + "learning_rate": 6.65194238977128e-05, + "loss": 0.9192, + "step": 11290 + }, + { + "epoch": 0.65, + "grad_norm": 0.2470703125, + "learning_rate": 6.642507403285732e-05, + "loss": 0.9325, + "step": 11295 + }, + { + "epoch": 0.65, + "grad_norm": 0.26953125, + "learning_rate": 6.633075784130619e-05, + "loss": 0.9313, + "step": 11300 + }, + { + "epoch": 0.65, + "grad_norm": 0.32421875, + "learning_rate": 6.623647541765195e-05, + "loss": 1.0401, + "step": 11305 + }, + { + "epoch": 0.65, + "grad_norm": 0.26171875, + "learning_rate": 6.614222685645324e-05, + "loss": 1.0222, + "step": 11310 + }, + { + "epoch": 0.65, + "grad_norm": 0.255859375, + "learning_rate": 6.604801225223486e-05, + "loss": 0.892, + "step": 11315 + }, + { + "epoch": 0.65, + "grad_norm": 0.24609375, + "learning_rate": 6.595383169948738e-05, + "loss": 0.8829, + "step": 11320 + }, + { + "epoch": 0.65, + "grad_norm": 0.2490234375, + "learning_rate": 6.58596852926674e-05, + "loss": 0.9097, + "step": 11325 + }, + { + "epoch": 0.65, + "grad_norm": 0.267578125, + "learning_rate": 6.576557312619711e-05, + "loss": 0.9305, + "step": 11330 + }, + { + "epoch": 0.65, + "grad_norm": 0.267578125, + "learning_rate": 6.567149529446447e-05, + "loss": 0.9914, + "step": 11335 + }, + { + "epoch": 0.65, + "grad_norm": 0.26953125, + "learning_rate": 6.5577451891823e-05, + "loss": 0.9554, + "step": 11340 + }, + { + "epoch": 0.65, + "grad_norm": 0.2578125, + "learning_rate": 6.548344301259161e-05, + "loss": 0.9622, + "step": 11345 + }, + { + "epoch": 0.65, + "grad_norm": 0.26171875, + "learning_rate": 6.53894687510547e-05, + "loss": 0.9283, + "step": 11350 + }, + { + "epoch": 0.65, + "grad_norm": 0.259765625, + "learning_rate": 6.52955292014618e-05, + "loss": 0.926, + "step": 11355 + }, + { + "epoch": 0.65, + "grad_norm": 0.2734375, + "learning_rate": 6.52016244580278e-05, + "loss": 0.9567, + "step": 11360 + }, + { + "epoch": 0.65, + "grad_norm": 0.259765625, + "learning_rate": 6.51077546149325e-05, + "loss": 0.9689, + "step": 11365 + }, + { + "epoch": 0.65, + "grad_norm": 0.23828125, + "learning_rate": 6.50139197663209e-05, + "loss": 0.9252, + "step": 11370 + }, + { + "epoch": 0.65, + "grad_norm": 0.2578125, + "learning_rate": 6.492012000630269e-05, + "loss": 0.9499, + "step": 11375 + }, + { + "epoch": 0.65, + "grad_norm": 0.26171875, + "learning_rate": 6.482635542895255e-05, + "loss": 0.9449, + "step": 11380 + }, + { + "epoch": 0.65, + "grad_norm": 0.267578125, + "learning_rate": 6.473262612830977e-05, + "loss": 1.0348, + "step": 11385 + }, + { + "epoch": 0.65, + "grad_norm": 0.255859375, + "learning_rate": 6.46389321983783e-05, + "loss": 0.9567, + "step": 11390 + }, + { + "epoch": 0.65, + "grad_norm": 0.2490234375, + "learning_rate": 6.45452737331266e-05, + "loss": 0.9624, + "step": 11395 + }, + { + "epoch": 0.65, + "grad_norm": 0.2490234375, + "learning_rate": 6.445165082648755e-05, + "loss": 0.9057, + "step": 11400 + }, + { + "epoch": 0.65, + "grad_norm": 0.23828125, + "learning_rate": 6.43580635723584e-05, + "loss": 0.9551, + "step": 11405 + }, + { + "epoch": 0.65, + "grad_norm": 0.2890625, + "learning_rate": 6.426451206460061e-05, + "loss": 0.9804, + "step": 11410 + }, + { + "epoch": 0.65, + "grad_norm": 0.244140625, + "learning_rate": 6.417099639703979e-05, + "loss": 0.9483, + "step": 11415 + }, + { + "epoch": 0.66, + "grad_norm": 0.26171875, + "learning_rate": 6.407751666346569e-05, + "loss": 0.953, + "step": 11420 + }, + { + "epoch": 0.66, + "grad_norm": 0.251953125, + "learning_rate": 6.398407295763187e-05, + "loss": 0.9259, + "step": 11425 + }, + { + "epoch": 0.66, + "grad_norm": 0.26953125, + "learning_rate": 6.38906653732559e-05, + "loss": 0.9765, + "step": 11430 + }, + { + "epoch": 0.66, + "grad_norm": 0.251953125, + "learning_rate": 6.3797294004019e-05, + "loss": 0.9349, + "step": 11435 + }, + { + "epoch": 0.66, + "grad_norm": 0.28125, + "learning_rate": 6.37039589435662e-05, + "loss": 0.9406, + "step": 11440 + }, + { + "epoch": 0.66, + "grad_norm": 0.2578125, + "learning_rate": 6.361066028550593e-05, + "loss": 0.8959, + "step": 11445 + }, + { + "epoch": 0.66, + "grad_norm": 0.26171875, + "learning_rate": 6.351739812341036e-05, + "loss": 0.947, + "step": 11450 + }, + { + "epoch": 0.66, + "grad_norm": 0.267578125, + "learning_rate": 6.342417255081479e-05, + "loss": 0.9025, + "step": 11455 + }, + { + "epoch": 0.66, + "grad_norm": 0.271484375, + "learning_rate": 6.333098366121804e-05, + "loss": 0.9347, + "step": 11460 + }, + { + "epoch": 0.66, + "grad_norm": 0.2412109375, + "learning_rate": 6.323783154808205e-05, + "loss": 0.9275, + "step": 11465 + }, + { + "epoch": 0.66, + "grad_norm": 0.255859375, + "learning_rate": 6.314471630483183e-05, + "loss": 0.9963, + "step": 11470 + }, + { + "epoch": 0.66, + "grad_norm": 0.275390625, + "learning_rate": 6.305163802485554e-05, + "loss": 0.9659, + "step": 11475 + }, + { + "epoch": 0.66, + "grad_norm": 0.244140625, + "learning_rate": 6.29585968015041e-05, + "loss": 0.9556, + "step": 11480 + }, + { + "epoch": 0.66, + "grad_norm": 0.271484375, + "learning_rate": 6.286559272809142e-05, + "loss": 1.0343, + "step": 11485 + }, + { + "epoch": 0.66, + "grad_norm": 0.248046875, + "learning_rate": 6.277262589789406e-05, + "loss": 0.9564, + "step": 11490 + }, + { + "epoch": 0.66, + "grad_norm": 0.251953125, + "learning_rate": 6.267969640415124e-05, + "loss": 0.9114, + "step": 11495 + }, + { + "epoch": 0.66, + "grad_norm": 0.26953125, + "learning_rate": 6.258680434006478e-05, + "loss": 1.0162, + "step": 11500 + }, + { + "epoch": 0.66, + "grad_norm": 0.255859375, + "learning_rate": 6.24939497987989e-05, + "loss": 1.0089, + "step": 11505 + }, + { + "epoch": 0.66, + "grad_norm": 0.26953125, + "learning_rate": 6.240113287348026e-05, + "loss": 0.9537, + "step": 11510 + }, + { + "epoch": 0.66, + "grad_norm": 0.251953125, + "learning_rate": 6.230835365719767e-05, + "loss": 0.9135, + "step": 11515 + }, + { + "epoch": 0.66, + "grad_norm": 0.263671875, + "learning_rate": 6.22156122430023e-05, + "loss": 0.9519, + "step": 11520 + }, + { + "epoch": 0.66, + "grad_norm": 0.259765625, + "learning_rate": 6.212290872390722e-05, + "loss": 0.9276, + "step": 11525 + }, + { + "epoch": 0.66, + "grad_norm": 0.2490234375, + "learning_rate": 6.203024319288762e-05, + "loss": 0.8321, + "step": 11530 + }, + { + "epoch": 0.66, + "grad_norm": 0.255859375, + "learning_rate": 6.193761574288057e-05, + "loss": 0.9405, + "step": 11535 + }, + { + "epoch": 0.66, + "grad_norm": 0.265625, + "learning_rate": 6.184502646678486e-05, + "loss": 0.9904, + "step": 11540 + }, + { + "epoch": 0.66, + "grad_norm": 0.26171875, + "learning_rate": 6.175247545746116e-05, + "loss": 1.006, + "step": 11545 + }, + { + "epoch": 0.66, + "grad_norm": 0.2578125, + "learning_rate": 6.165996280773157e-05, + "loss": 0.8975, + "step": 11550 + }, + { + "epoch": 0.66, + "grad_norm": 0.28515625, + "learning_rate": 6.156748861037991e-05, + "loss": 0.9428, + "step": 11555 + }, + { + "epoch": 0.66, + "grad_norm": 0.2578125, + "learning_rate": 6.147505295815124e-05, + "loss": 0.923, + "step": 11560 + }, + { + "epoch": 0.66, + "grad_norm": 0.26171875, + "learning_rate": 6.138265594375212e-05, + "loss": 1.0096, + "step": 11565 + }, + { + "epoch": 0.66, + "grad_norm": 0.259765625, + "learning_rate": 6.129029765985028e-05, + "loss": 0.9237, + "step": 11570 + }, + { + "epoch": 0.66, + "grad_norm": 0.26171875, + "learning_rate": 6.119797819907463e-05, + "loss": 0.925, + "step": 11575 + }, + { + "epoch": 0.66, + "grad_norm": 0.265625, + "learning_rate": 6.110569765401513e-05, + "loss": 0.9749, + "step": 11580 + }, + { + "epoch": 0.66, + "grad_norm": 0.26171875, + "learning_rate": 6.1013456117222686e-05, + "loss": 0.9196, + "step": 11585 + }, + { + "epoch": 0.67, + "grad_norm": 0.2734375, + "learning_rate": 6.092125368120921e-05, + "loss": 0.8837, + "step": 11590 + }, + { + "epoch": 0.67, + "grad_norm": 0.28125, + "learning_rate": 6.082909043844719e-05, + "loss": 0.9113, + "step": 11595 + }, + { + "epoch": 0.67, + "grad_norm": 0.24609375, + "learning_rate": 6.073696648137001e-05, + "loss": 0.9584, + "step": 11600 + }, + { + "epoch": 0.67, + "grad_norm": 0.27734375, + "learning_rate": 6.0644881902371474e-05, + "loss": 0.9302, + "step": 11605 + }, + { + "epoch": 0.67, + "grad_norm": 0.263671875, + "learning_rate": 6.055283679380605e-05, + "loss": 0.9446, + "step": 11610 + }, + { + "epoch": 0.67, + "grad_norm": 0.263671875, + "learning_rate": 6.046083124798851e-05, + "loss": 1.0009, + "step": 11615 + }, + { + "epoch": 0.67, + "grad_norm": 0.271484375, + "learning_rate": 6.036886535719399e-05, + "loss": 0.9066, + "step": 11620 + }, + { + "epoch": 0.67, + "grad_norm": 0.259765625, + "learning_rate": 6.027693921365789e-05, + "loss": 0.9608, + "step": 11625 + }, + { + "epoch": 0.67, + "grad_norm": 0.25390625, + "learning_rate": 6.018505290957565e-05, + "loss": 0.9562, + "step": 11630 + }, + { + "epoch": 0.67, + "grad_norm": 0.25390625, + "learning_rate": 6.0093206537102866e-05, + "loss": 0.9836, + "step": 11635 + }, + { + "epoch": 0.67, + "grad_norm": 0.244140625, + "learning_rate": 6.000140018835497e-05, + "loss": 0.9908, + "step": 11640 + }, + { + "epoch": 0.67, + "grad_norm": 0.26953125, + "learning_rate": 5.990963395540739e-05, + "loss": 0.9516, + "step": 11645 + }, + { + "epoch": 0.67, + "grad_norm": 0.251953125, + "learning_rate": 5.9817907930295155e-05, + "loss": 0.934, + "step": 11650 + }, + { + "epoch": 0.67, + "grad_norm": 0.26953125, + "learning_rate": 5.972622220501315e-05, + "loss": 0.9158, + "step": 11655 + }, + { + "epoch": 0.67, + "grad_norm": 0.265625, + "learning_rate": 5.9634576871515656e-05, + "loss": 0.8874, + "step": 11660 + }, + { + "epoch": 0.67, + "grad_norm": 0.263671875, + "learning_rate": 5.9542972021716616e-05, + "loss": 0.9071, + "step": 11665 + }, + { + "epoch": 0.67, + "grad_norm": 0.265625, + "learning_rate": 5.945140774748929e-05, + "loss": 0.9595, + "step": 11670 + }, + { + "epoch": 0.67, + "grad_norm": 0.26953125, + "learning_rate": 5.935988414066617e-05, + "loss": 0.9756, + "step": 11675 + }, + { + "epoch": 0.67, + "grad_norm": 0.259765625, + "learning_rate": 5.9268401293039125e-05, + "loss": 1.0034, + "step": 11680 + }, + { + "epoch": 0.67, + "grad_norm": 0.26171875, + "learning_rate": 5.917695929635898e-05, + "loss": 0.8618, + "step": 11685 + }, + { + "epoch": 0.67, + "grad_norm": 0.26953125, + "learning_rate": 5.908555824233575e-05, + "loss": 0.9354, + "step": 11690 + }, + { + "epoch": 0.67, + "grad_norm": 0.26171875, + "learning_rate": 5.899419822263822e-05, + "loss": 0.9256, + "step": 11695 + }, + { + "epoch": 0.67, + "grad_norm": 0.26953125, + "learning_rate": 5.8902879328894156e-05, + "loss": 0.8939, + "step": 11700 + }, + { + "epoch": 0.67, + "grad_norm": 0.263671875, + "learning_rate": 5.881160165269004e-05, + "loss": 0.9736, + "step": 11705 + }, + { + "epoch": 0.67, + "grad_norm": 0.28515625, + "learning_rate": 5.872036528557096e-05, + "loss": 0.9653, + "step": 11710 + }, + { + "epoch": 0.67, + "grad_norm": 0.28125, + "learning_rate": 5.862917031904066e-05, + "loss": 0.9689, + "step": 11715 + }, + { + "epoch": 0.67, + "grad_norm": 0.2578125, + "learning_rate": 5.853801684456126e-05, + "loss": 0.9334, + "step": 11720 + }, + { + "epoch": 0.67, + "grad_norm": 0.287109375, + "learning_rate": 5.844690495355338e-05, + "loss": 0.9702, + "step": 11725 + }, + { + "epoch": 0.67, + "grad_norm": 0.259765625, + "learning_rate": 5.8355834737395856e-05, + "loss": 0.8879, + "step": 11730 + }, + { + "epoch": 0.67, + "grad_norm": 0.2431640625, + "learning_rate": 5.8264806287425724e-05, + "loss": 0.9326, + "step": 11735 + }, + { + "epoch": 0.67, + "grad_norm": 0.26171875, + "learning_rate": 5.817381969493823e-05, + "loss": 0.9541, + "step": 11740 + }, + { + "epoch": 0.67, + "grad_norm": 0.251953125, + "learning_rate": 5.808287505118647e-05, + "loss": 0.9941, + "step": 11745 + }, + { + "epoch": 0.67, + "grad_norm": 0.26953125, + "learning_rate": 5.799197244738166e-05, + "loss": 0.9653, + "step": 11750 + }, + { + "epoch": 0.67, + "grad_norm": 0.259765625, + "learning_rate": 5.790111197469269e-05, + "loss": 0.9109, + "step": 11755 + }, + { + "epoch": 0.67, + "grad_norm": 0.27734375, + "learning_rate": 5.781029372424633e-05, + "loss": 1.0813, + "step": 11760 + }, + { + "epoch": 0.68, + "grad_norm": 0.2734375, + "learning_rate": 5.7719517787126856e-05, + "loss": 0.9185, + "step": 11765 + }, + { + "epoch": 0.68, + "grad_norm": 0.25390625, + "learning_rate": 5.762878425437627e-05, + "loss": 0.842, + "step": 11770 + }, + { + "epoch": 0.68, + "grad_norm": 0.244140625, + "learning_rate": 5.753809321699388e-05, + "loss": 0.9481, + "step": 11775 + }, + { + "epoch": 0.68, + "grad_norm": 0.26171875, + "learning_rate": 5.744744476593652e-05, + "loss": 0.9382, + "step": 11780 + }, + { + "epoch": 0.68, + "grad_norm": 0.255859375, + "learning_rate": 5.7356838992118277e-05, + "loss": 1.0021, + "step": 11785 + }, + { + "epoch": 0.68, + "grad_norm": 0.267578125, + "learning_rate": 5.7266275986410324e-05, + "loss": 1.0197, + "step": 11790 + }, + { + "epoch": 0.68, + "grad_norm": 0.251953125, + "learning_rate": 5.717575583964111e-05, + "loss": 0.9619, + "step": 11795 + }, + { + "epoch": 0.68, + "grad_norm": 0.26171875, + "learning_rate": 5.708527864259594e-05, + "loss": 0.9145, + "step": 11800 + }, + { + "epoch": 0.68, + "grad_norm": 0.263671875, + "learning_rate": 5.6994844486017204e-05, + "loss": 0.865, + "step": 11805 + }, + { + "epoch": 0.68, + "grad_norm": 0.255859375, + "learning_rate": 5.6904453460603955e-05, + "loss": 1.0117, + "step": 11810 + }, + { + "epoch": 0.68, + "grad_norm": 0.259765625, + "learning_rate": 5.681410565701215e-05, + "loss": 0.9478, + "step": 11815 + }, + { + "epoch": 0.68, + "grad_norm": 0.271484375, + "learning_rate": 5.672380116585425e-05, + "loss": 0.9194, + "step": 11820 + }, + { + "epoch": 0.68, + "grad_norm": 0.263671875, + "learning_rate": 5.663354007769943e-05, + "loss": 0.9231, + "step": 11825 + }, + { + "epoch": 0.68, + "grad_norm": 0.251953125, + "learning_rate": 5.654332248307319e-05, + "loss": 1.0146, + "step": 11830 + }, + { + "epoch": 0.68, + "grad_norm": 0.26171875, + "learning_rate": 5.6453148472457476e-05, + "loss": 0.923, + "step": 11835 + }, + { + "epoch": 0.68, + "grad_norm": 0.259765625, + "learning_rate": 5.636301813629057e-05, + "loss": 0.9278, + "step": 11840 + }, + { + "epoch": 0.68, + "grad_norm": 0.255859375, + "learning_rate": 5.62729315649668e-05, + "loss": 0.9252, + "step": 11845 + }, + { + "epoch": 0.68, + "grad_norm": 0.259765625, + "learning_rate": 5.618288884883684e-05, + "loss": 0.8796, + "step": 11850 + }, + { + "epoch": 0.68, + "grad_norm": 0.271484375, + "learning_rate": 5.6092890078207107e-05, + "loss": 0.9037, + "step": 11855 + }, + { + "epoch": 0.68, + "grad_norm": 0.26171875, + "learning_rate": 5.600293534334014e-05, + "loss": 0.8821, + "step": 11860 + }, + { + "epoch": 0.68, + "grad_norm": 0.26171875, + "learning_rate": 5.591302473445429e-05, + "loss": 0.9258, + "step": 11865 + }, + { + "epoch": 0.68, + "grad_norm": 0.259765625, + "learning_rate": 5.582315834172353e-05, + "loss": 0.9576, + "step": 11870 + }, + { + "epoch": 0.68, + "grad_norm": 0.263671875, + "learning_rate": 5.573333625527767e-05, + "loss": 0.9547, + "step": 11875 + }, + { + "epoch": 0.68, + "grad_norm": 0.2578125, + "learning_rate": 5.564355856520189e-05, + "loss": 0.9878, + "step": 11880 + }, + { + "epoch": 0.68, + "grad_norm": 0.25390625, + "learning_rate": 5.555382536153702e-05, + "loss": 0.8999, + "step": 11885 + }, + { + "epoch": 0.68, + "grad_norm": 0.248046875, + "learning_rate": 5.5464136734279094e-05, + "loss": 0.9551, + "step": 11890 + }, + { + "epoch": 0.68, + "grad_norm": 0.2578125, + "learning_rate": 5.537449277337965e-05, + "loss": 0.9651, + "step": 11895 + }, + { + "epoch": 0.68, + "grad_norm": 0.240234375, + "learning_rate": 5.528489356874522e-05, + "loss": 0.9024, + "step": 11900 + }, + { + "epoch": 0.68, + "grad_norm": 0.25, + "learning_rate": 5.5195339210237626e-05, + "loss": 0.9697, + "step": 11905 + }, + { + "epoch": 0.68, + "grad_norm": 0.251953125, + "learning_rate": 5.510582978767356e-05, + "loss": 0.99, + "step": 11910 + }, + { + "epoch": 0.68, + "grad_norm": 0.291015625, + "learning_rate": 5.501636539082478e-05, + "loss": 0.9663, + "step": 11915 + }, + { + "epoch": 0.68, + "grad_norm": 0.2333984375, + "learning_rate": 5.4926946109417775e-05, + "loss": 0.9425, + "step": 11920 + }, + { + "epoch": 0.68, + "grad_norm": 0.251953125, + "learning_rate": 5.483757203313383e-05, + "loss": 1.0466, + "step": 11925 + }, + { + "epoch": 0.68, + "grad_norm": 0.26953125, + "learning_rate": 5.4748243251608965e-05, + "loss": 0.9748, + "step": 11930 + }, + { + "epoch": 0.68, + "grad_norm": 0.251953125, + "learning_rate": 5.465895985443361e-05, + "loss": 0.9863, + "step": 11935 + }, + { + "epoch": 0.69, + "grad_norm": 0.248046875, + "learning_rate": 5.4569721931152864e-05, + "loss": 0.8764, + "step": 11940 + }, + { + "epoch": 0.69, + "grad_norm": 0.28515625, + "learning_rate": 5.448052957126606e-05, + "loss": 1.0002, + "step": 11945 + }, + { + "epoch": 0.69, + "grad_norm": 0.28125, + "learning_rate": 5.4391382864226916e-05, + "loss": 0.9917, + "step": 11950 + }, + { + "epoch": 0.69, + "grad_norm": 0.248046875, + "learning_rate": 5.4302281899443394e-05, + "loss": 0.9495, + "step": 11955 + }, + { + "epoch": 0.69, + "grad_norm": 0.291015625, + "learning_rate": 5.421322676627747e-05, + "loss": 0.9429, + "step": 11960 + }, + { + "epoch": 0.69, + "grad_norm": 0.263671875, + "learning_rate": 5.412421755404529e-05, + "loss": 1.0265, + "step": 11965 + }, + { + "epoch": 0.69, + "grad_norm": 0.25, + "learning_rate": 5.40352543520168e-05, + "loss": 0.9534, + "step": 11970 + }, + { + "epoch": 0.69, + "grad_norm": 0.23828125, + "learning_rate": 5.3946337249415936e-05, + "loss": 0.9935, + "step": 11975 + }, + { + "epoch": 0.69, + "grad_norm": 0.283203125, + "learning_rate": 5.385746633542027e-05, + "loss": 0.9642, + "step": 11980 + }, + { + "epoch": 0.69, + "grad_norm": 0.279296875, + "learning_rate": 5.376864169916116e-05, + "loss": 0.9807, + "step": 11985 + }, + { + "epoch": 0.69, + "grad_norm": 0.267578125, + "learning_rate": 5.367986342972355e-05, + "loss": 0.9962, + "step": 11990 + }, + { + "epoch": 0.69, + "grad_norm": 0.30859375, + "learning_rate": 5.359113161614576e-05, + "loss": 0.924, + "step": 11995 + }, + { + "epoch": 0.69, + "grad_norm": 0.26953125, + "learning_rate": 5.3502446347419674e-05, + "loss": 0.9266, + "step": 12000 + }, + { + "epoch": 0.69, + "grad_norm": 0.26171875, + "learning_rate": 5.341380771249037e-05, + "loss": 0.9088, + "step": 12005 + }, + { + "epoch": 0.69, + "grad_norm": 0.259765625, + "learning_rate": 5.332521580025622e-05, + "loss": 1.0162, + "step": 12010 + }, + { + "epoch": 0.69, + "grad_norm": 0.26953125, + "learning_rate": 5.323667069956868e-05, + "loss": 0.954, + "step": 12015 + }, + { + "epoch": 0.69, + "grad_norm": 0.25390625, + "learning_rate": 5.314817249923236e-05, + "loss": 0.9517, + "step": 12020 + }, + { + "epoch": 0.69, + "grad_norm": 0.25, + "learning_rate": 5.3059721288004714e-05, + "loss": 0.9392, + "step": 12025 + }, + { + "epoch": 0.69, + "grad_norm": 0.267578125, + "learning_rate": 5.297131715459614e-05, + "loss": 0.9615, + "step": 12030 + }, + { + "epoch": 0.69, + "grad_norm": 0.2255859375, + "learning_rate": 5.288296018766987e-05, + "loss": 1.0129, + "step": 12035 + }, + { + "epoch": 0.69, + "grad_norm": 0.287109375, + "learning_rate": 5.2794650475841664e-05, + "loss": 0.9961, + "step": 12040 + }, + { + "epoch": 0.69, + "grad_norm": 0.240234375, + "learning_rate": 5.2706388107680095e-05, + "loss": 0.8726, + "step": 12045 + }, + { + "epoch": 0.69, + "grad_norm": 0.279296875, + "learning_rate": 5.2618173171706064e-05, + "loss": 0.9517, + "step": 12050 + }, + { + "epoch": 0.69, + "grad_norm": 0.267578125, + "learning_rate": 5.253000575639305e-05, + "loss": 1.0232, + "step": 12055 + }, + { + "epoch": 0.69, + "grad_norm": 0.26171875, + "learning_rate": 5.2441885950166746e-05, + "loss": 0.9498, + "step": 12060 + }, + { + "epoch": 0.69, + "grad_norm": 0.244140625, + "learning_rate": 5.235381384140519e-05, + "loss": 0.9127, + "step": 12065 + }, + { + "epoch": 0.69, + "grad_norm": 0.271484375, + "learning_rate": 5.226578951843859e-05, + "loss": 0.9645, + "step": 12070 + }, + { + "epoch": 0.69, + "grad_norm": 0.25, + "learning_rate": 5.217781306954912e-05, + "loss": 0.906, + "step": 12075 + }, + { + "epoch": 0.69, + "grad_norm": 0.265625, + "learning_rate": 5.208988458297109e-05, + "loss": 0.9287, + "step": 12080 + }, + { + "epoch": 0.69, + "grad_norm": 0.25390625, + "learning_rate": 5.2002004146890535e-05, + "loss": 0.9332, + "step": 12085 + }, + { + "epoch": 0.69, + "grad_norm": 0.263671875, + "learning_rate": 5.191417184944549e-05, + "loss": 0.9164, + "step": 12090 + }, + { + "epoch": 0.69, + "grad_norm": 0.26171875, + "learning_rate": 5.182638777872555e-05, + "loss": 0.8766, + "step": 12095 + }, + { + "epoch": 0.69, + "grad_norm": 0.279296875, + "learning_rate": 5.1738652022771974e-05, + "loss": 0.9041, + "step": 12100 + }, + { + "epoch": 0.69, + "grad_norm": 0.275390625, + "learning_rate": 5.165096466957769e-05, + "loss": 0.9226, + "step": 12105 + }, + { + "epoch": 0.69, + "grad_norm": 0.267578125, + "learning_rate": 5.1563325807086856e-05, + "loss": 0.8697, + "step": 12110 + }, + { + "epoch": 0.7, + "grad_norm": 0.26171875, + "learning_rate": 5.147573552319526e-05, + "loss": 0.8859, + "step": 12115 + }, + { + "epoch": 0.7, + "grad_norm": 0.2890625, + "learning_rate": 5.138819390574972e-05, + "loss": 0.8888, + "step": 12120 + }, + { + "epoch": 0.7, + "grad_norm": 0.251953125, + "learning_rate": 5.130070104254847e-05, + "loss": 0.9413, + "step": 12125 + }, + { + "epoch": 0.7, + "grad_norm": 0.2578125, + "learning_rate": 5.121325702134063e-05, + "loss": 0.944, + "step": 12130 + }, + { + "epoch": 0.7, + "grad_norm": 0.28125, + "learning_rate": 5.112586192982653e-05, + "loss": 0.9692, + "step": 12135 + }, + { + "epoch": 0.7, + "grad_norm": 0.25390625, + "learning_rate": 5.1038515855657264e-05, + "loss": 0.9133, + "step": 12140 + }, + { + "epoch": 0.7, + "grad_norm": 0.2734375, + "learning_rate": 5.0951218886434884e-05, + "loss": 0.969, + "step": 12145 + }, + { + "epoch": 0.7, + "grad_norm": 0.26171875, + "learning_rate": 5.086397110971218e-05, + "loss": 0.9552, + "step": 12150 + }, + { + "epoch": 0.7, + "grad_norm": 0.265625, + "learning_rate": 5.077677261299251e-05, + "loss": 0.8627, + "step": 12155 + }, + { + "epoch": 0.7, + "grad_norm": 0.26171875, + "learning_rate": 5.068962348372992e-05, + "loss": 0.9822, + "step": 12160 + }, + { + "epoch": 0.7, + "grad_norm": 0.26171875, + "learning_rate": 5.060252380932886e-05, + "loss": 0.936, + "step": 12165 + }, + { + "epoch": 0.7, + "grad_norm": 0.248046875, + "learning_rate": 5.0515473677144254e-05, + "loss": 0.9528, + "step": 12170 + }, + { + "epoch": 0.7, + "grad_norm": 0.25, + "learning_rate": 5.042847317448125e-05, + "loss": 0.9435, + "step": 12175 + }, + { + "epoch": 0.7, + "grad_norm": 0.26171875, + "learning_rate": 5.034152238859533e-05, + "loss": 0.9473, + "step": 12180 + }, + { + "epoch": 0.7, + "grad_norm": 0.25, + "learning_rate": 5.025462140669204e-05, + "loss": 0.937, + "step": 12185 + }, + { + "epoch": 0.7, + "grad_norm": 0.265625, + "learning_rate": 5.016777031592694e-05, + "loss": 0.9567, + "step": 12190 + }, + { + "epoch": 0.7, + "grad_norm": 0.251953125, + "learning_rate": 5.008096920340568e-05, + "loss": 0.9716, + "step": 12195 + }, + { + "epoch": 0.7, + "grad_norm": 0.265625, + "learning_rate": 4.999421815618364e-05, + "loss": 0.9504, + "step": 12200 + }, + { + "epoch": 0.7, + "grad_norm": 0.2490234375, + "learning_rate": 4.990751726126612e-05, + "loss": 0.9038, + "step": 12205 + }, + { + "epoch": 0.7, + "grad_norm": 0.25390625, + "learning_rate": 4.9820866605607994e-05, + "loss": 0.8834, + "step": 12210 + }, + { + "epoch": 0.7, + "grad_norm": 0.27734375, + "learning_rate": 4.973426627611389e-05, + "loss": 0.9035, + "step": 12215 + }, + { + "epoch": 0.7, + "grad_norm": 0.25390625, + "learning_rate": 4.964771635963781e-05, + "loss": 0.9357, + "step": 12220 + }, + { + "epoch": 0.7, + "grad_norm": 0.259765625, + "learning_rate": 4.95612169429833e-05, + "loss": 0.9755, + "step": 12225 + }, + { + "epoch": 0.7, + "grad_norm": 0.240234375, + "learning_rate": 4.9474768112903293e-05, + "loss": 0.9856, + "step": 12230 + }, + { + "epoch": 0.7, + "grad_norm": 0.2431640625, + "learning_rate": 4.9388369956099815e-05, + "loss": 0.898, + "step": 12235 + }, + { + "epoch": 0.7, + "grad_norm": 0.2578125, + "learning_rate": 4.930202255922427e-05, + "loss": 0.9254, + "step": 12240 + }, + { + "epoch": 0.7, + "grad_norm": 0.26953125, + "learning_rate": 4.9215726008876995e-05, + "loss": 0.945, + "step": 12245 + }, + { + "epoch": 0.7, + "grad_norm": 0.26171875, + "learning_rate": 4.9129480391607465e-05, + "loss": 0.9477, + "step": 12250 + }, + { + "epoch": 0.7, + "grad_norm": 0.265625, + "learning_rate": 4.904328579391393e-05, + "loss": 0.9186, + "step": 12255 + }, + { + "epoch": 0.7, + "grad_norm": 0.2421875, + "learning_rate": 4.895714230224363e-05, + "loss": 0.9673, + "step": 12260 + }, + { + "epoch": 0.7, + "grad_norm": 0.265625, + "learning_rate": 4.887105000299239e-05, + "loss": 1.0013, + "step": 12265 + }, + { + "epoch": 0.7, + "grad_norm": 0.25390625, + "learning_rate": 4.8785008982504845e-05, + "loss": 0.9292, + "step": 12270 + }, + { + "epoch": 0.7, + "grad_norm": 0.267578125, + "learning_rate": 4.8699019327074035e-05, + "loss": 0.9545, + "step": 12275 + }, + { + "epoch": 0.7, + "grad_norm": 0.2421875, + "learning_rate": 4.861308112294168e-05, + "loss": 1.0284, + "step": 12280 + }, + { + "epoch": 0.7, + "grad_norm": 0.28125, + "learning_rate": 4.852719445629773e-05, + "loss": 1.0357, + "step": 12285 + }, + { + "epoch": 0.71, + "grad_norm": 0.244140625, + "learning_rate": 4.844135941328048e-05, + "loss": 0.9705, + "step": 12290 + }, + { + "epoch": 0.71, + "grad_norm": 0.279296875, + "learning_rate": 4.835557607997656e-05, + "loss": 1.0149, + "step": 12295 + }, + { + "epoch": 0.71, + "grad_norm": 0.259765625, + "learning_rate": 4.826984454242057e-05, + "loss": 0.9071, + "step": 12300 + }, + { + "epoch": 0.71, + "grad_norm": 0.271484375, + "learning_rate": 4.818416488659534e-05, + "loss": 1.006, + "step": 12305 + }, + { + "epoch": 0.71, + "grad_norm": 0.255859375, + "learning_rate": 4.80985371984315e-05, + "loss": 1.007, + "step": 12310 + }, + { + "epoch": 0.71, + "grad_norm": 0.26171875, + "learning_rate": 4.801296156380767e-05, + "loss": 0.9192, + "step": 12315 + }, + { + "epoch": 0.71, + "grad_norm": 0.275390625, + "learning_rate": 4.7927438068550256e-05, + "loss": 0.991, + "step": 12320 + }, + { + "epoch": 0.71, + "grad_norm": 0.265625, + "learning_rate": 4.78419667984333e-05, + "loss": 0.9496, + "step": 12325 + }, + { + "epoch": 0.71, + "grad_norm": 0.251953125, + "learning_rate": 4.7756547839178564e-05, + "loss": 0.9835, + "step": 12330 + }, + { + "epoch": 0.71, + "grad_norm": 0.26171875, + "learning_rate": 4.767118127645524e-05, + "loss": 0.9501, + "step": 12335 + }, + { + "epoch": 0.71, + "grad_norm": 0.25390625, + "learning_rate": 4.758586719588007e-05, + "loss": 0.9286, + "step": 12340 + }, + { + "epoch": 0.71, + "grad_norm": 0.251953125, + "learning_rate": 4.7500605683017076e-05, + "loss": 0.9161, + "step": 12345 + }, + { + "epoch": 0.71, + "grad_norm": 0.255859375, + "learning_rate": 4.74153968233776e-05, + "loss": 0.9634, + "step": 12350 + }, + { + "epoch": 0.71, + "grad_norm": 0.2373046875, + "learning_rate": 4.733024070242024e-05, + "loss": 0.8658, + "step": 12355 + }, + { + "epoch": 0.71, + "grad_norm": 0.263671875, + "learning_rate": 4.724513740555053e-05, + "loss": 0.9709, + "step": 12360 + }, + { + "epoch": 0.71, + "grad_norm": 0.263671875, + "learning_rate": 4.716008701812123e-05, + "loss": 0.8628, + "step": 12365 + }, + { + "epoch": 0.71, + "grad_norm": 0.25390625, + "learning_rate": 4.707508962543188e-05, + "loss": 0.9155, + "step": 12370 + }, + { + "epoch": 0.71, + "grad_norm": 0.28125, + "learning_rate": 4.699014531272894e-05, + "loss": 0.9683, + "step": 12375 + }, + { + "epoch": 0.71, + "grad_norm": 0.2734375, + "learning_rate": 4.690525416520557e-05, + "loss": 1.0008, + "step": 12380 + }, + { + "epoch": 0.71, + "grad_norm": 0.275390625, + "learning_rate": 4.6820416268001747e-05, + "loss": 0.9406, + "step": 12385 + }, + { + "epoch": 0.71, + "grad_norm": 0.2890625, + "learning_rate": 4.673563170620385e-05, + "loss": 0.8862, + "step": 12390 + }, + { + "epoch": 0.71, + "grad_norm": 0.25390625, + "learning_rate": 4.6650900564844935e-05, + "loss": 0.9025, + "step": 12395 + }, + { + "epoch": 0.71, + "grad_norm": 0.2490234375, + "learning_rate": 4.6566222928904436e-05, + "loss": 0.9169, + "step": 12400 + }, + { + "epoch": 0.71, + "grad_norm": 0.267578125, + "learning_rate": 4.648159888330804e-05, + "loss": 0.9738, + "step": 12405 + }, + { + "epoch": 0.71, + "grad_norm": 0.2431640625, + "learning_rate": 4.639702851292782e-05, + "loss": 0.9669, + "step": 12410 + }, + { + "epoch": 0.71, + "grad_norm": 0.2412109375, + "learning_rate": 4.631251190258187e-05, + "loss": 0.9728, + "step": 12415 + }, + { + "epoch": 0.71, + "grad_norm": 0.2470703125, + "learning_rate": 4.622804913703452e-05, + "loss": 0.9234, + "step": 12420 + }, + { + "epoch": 0.71, + "grad_norm": 0.248046875, + "learning_rate": 4.614364030099596e-05, + "loss": 0.94, + "step": 12425 + }, + { + "epoch": 0.71, + "grad_norm": 0.265625, + "learning_rate": 4.605928547912237e-05, + "loss": 0.9276, + "step": 12430 + }, + { + "epoch": 0.71, + "grad_norm": 0.267578125, + "learning_rate": 4.597498475601579e-05, + "loss": 0.9564, + "step": 12435 + }, + { + "epoch": 0.71, + "grad_norm": 0.255859375, + "learning_rate": 4.5890738216223884e-05, + "loss": 1.0038, + "step": 12440 + }, + { + "epoch": 0.71, + "grad_norm": 0.279296875, + "learning_rate": 4.58065459442401e-05, + "loss": 1.0051, + "step": 12445 + }, + { + "epoch": 0.71, + "grad_norm": 0.265625, + "learning_rate": 4.572240802450335e-05, + "loss": 0.901, + "step": 12450 + }, + { + "epoch": 0.71, + "grad_norm": 0.25390625, + "learning_rate": 4.5638324541398136e-05, + "loss": 0.9423, + "step": 12455 + }, + { + "epoch": 0.71, + "grad_norm": 0.26171875, + "learning_rate": 4.55542955792543e-05, + "loss": 0.9089, + "step": 12460 + }, + { + "epoch": 0.72, + "grad_norm": 0.271484375, + "learning_rate": 4.547032122234698e-05, + "loss": 0.9291, + "step": 12465 + }, + { + "epoch": 0.72, + "grad_norm": 0.255859375, + "learning_rate": 4.538640155489666e-05, + "loss": 0.9689, + "step": 12470 + }, + { + "epoch": 0.72, + "grad_norm": 0.333984375, + "learning_rate": 4.5302536661068816e-05, + "loss": 0.9536, + "step": 12475 + }, + { + "epoch": 0.72, + "grad_norm": 0.228515625, + "learning_rate": 4.521872662497416e-05, + "loss": 0.8805, + "step": 12480 + }, + { + "epoch": 0.72, + "grad_norm": 0.23828125, + "learning_rate": 4.513497153066822e-05, + "loss": 0.8875, + "step": 12485 + }, + { + "epoch": 0.72, + "grad_norm": 0.263671875, + "learning_rate": 4.505127146215159e-05, + "loss": 0.9778, + "step": 12490 + }, + { + "epoch": 0.72, + "grad_norm": 0.2431640625, + "learning_rate": 4.49676265033695e-05, + "loss": 0.9782, + "step": 12495 + }, + { + "epoch": 0.72, + "grad_norm": 0.25, + "learning_rate": 4.4884036738212074e-05, + "loss": 0.9139, + "step": 12500 + }, + { + "epoch": 0.72, + "grad_norm": 0.275390625, + "learning_rate": 4.480050225051394e-05, + "loss": 0.9136, + "step": 12505 + }, + { + "epoch": 0.72, + "grad_norm": 0.2578125, + "learning_rate": 4.4717023124054394e-05, + "loss": 0.9731, + "step": 12510 + }, + { + "epoch": 0.72, + "grad_norm": 0.25, + "learning_rate": 4.463359944255718e-05, + "loss": 0.9265, + "step": 12515 + }, + { + "epoch": 0.72, + "grad_norm": 0.271484375, + "learning_rate": 4.455023128969036e-05, + "loss": 0.9152, + "step": 12520 + }, + { + "epoch": 0.72, + "grad_norm": 0.271484375, + "learning_rate": 4.446691874906645e-05, + "loss": 0.9031, + "step": 12525 + }, + { + "epoch": 0.72, + "grad_norm": 0.248046875, + "learning_rate": 4.4383661904242e-05, + "loss": 0.9367, + "step": 12530 + }, + { + "epoch": 0.72, + "grad_norm": 0.28125, + "learning_rate": 4.430046083871791e-05, + "loss": 0.9354, + "step": 12535 + }, + { + "epoch": 0.72, + "grad_norm": 0.26953125, + "learning_rate": 4.421731563593895e-05, + "loss": 0.959, + "step": 12540 + }, + { + "epoch": 0.72, + "grad_norm": 0.2578125, + "learning_rate": 4.413422637929402e-05, + "loss": 0.9779, + "step": 12545 + }, + { + "epoch": 0.72, + "grad_norm": 0.25390625, + "learning_rate": 4.40511931521158e-05, + "loss": 0.9275, + "step": 12550 + }, + { + "epoch": 0.72, + "grad_norm": 0.259765625, + "learning_rate": 4.396821603768079e-05, + "loss": 0.9925, + "step": 12555 + }, + { + "epoch": 0.72, + "grad_norm": 0.2578125, + "learning_rate": 4.3885295119209294e-05, + "loss": 0.9756, + "step": 12560 + }, + { + "epoch": 0.72, + "grad_norm": 0.25, + "learning_rate": 4.380243047986513e-05, + "loss": 1.0194, + "step": 12565 + }, + { + "epoch": 0.72, + "grad_norm": 0.265625, + "learning_rate": 4.3719622202755816e-05, + "loss": 0.9061, + "step": 12570 + }, + { + "epoch": 0.72, + "grad_norm": 0.275390625, + "learning_rate": 4.3636870370932194e-05, + "loss": 0.9297, + "step": 12575 + }, + { + "epoch": 0.72, + "grad_norm": 0.259765625, + "learning_rate": 4.3554175067388636e-05, + "loss": 0.9902, + "step": 12580 + }, + { + "epoch": 0.72, + "grad_norm": 0.2431640625, + "learning_rate": 4.3471536375062696e-05, + "loss": 0.9526, + "step": 12585 + }, + { + "epoch": 0.72, + "grad_norm": 0.27734375, + "learning_rate": 4.338895437683521e-05, + "loss": 0.9041, + "step": 12590 + }, + { + "epoch": 0.72, + "grad_norm": 0.265625, + "learning_rate": 4.330642915553023e-05, + "loss": 0.9641, + "step": 12595 + }, + { + "epoch": 0.72, + "grad_norm": 0.259765625, + "learning_rate": 4.322396079391467e-05, + "loss": 0.9234, + "step": 12600 + }, + { + "epoch": 0.72, + "grad_norm": 0.265625, + "learning_rate": 4.3141549374698645e-05, + "loss": 0.8561, + "step": 12605 + }, + { + "epoch": 0.72, + "grad_norm": 0.265625, + "learning_rate": 4.305919498053495e-05, + "loss": 0.9899, + "step": 12610 + }, + { + "epoch": 0.72, + "grad_norm": 0.265625, + "learning_rate": 4.2976897694019356e-05, + "loss": 0.9444, + "step": 12615 + }, + { + "epoch": 0.72, + "grad_norm": 0.271484375, + "learning_rate": 4.289465759769025e-05, + "loss": 0.9067, + "step": 12620 + }, + { + "epoch": 0.72, + "grad_norm": 0.259765625, + "learning_rate": 4.2812474774028735e-05, + "loss": 0.9728, + "step": 12625 + }, + { + "epoch": 0.72, + "grad_norm": 0.25, + "learning_rate": 4.27303493054584e-05, + "loss": 0.955, + "step": 12630 + }, + { + "epoch": 0.72, + "grad_norm": 0.26171875, + "learning_rate": 4.264828127434539e-05, + "loss": 0.9825, + "step": 12635 + }, + { + "epoch": 0.73, + "grad_norm": 0.2490234375, + "learning_rate": 4.256627076299816e-05, + "loss": 0.9644, + "step": 12640 + }, + { + "epoch": 0.73, + "grad_norm": 0.251953125, + "learning_rate": 4.248431785366759e-05, + "loss": 0.9211, + "step": 12645 + }, + { + "epoch": 0.73, + "grad_norm": 0.267578125, + "learning_rate": 4.2402422628546666e-05, + "loss": 0.9046, + "step": 12650 + }, + { + "epoch": 0.73, + "grad_norm": 0.279296875, + "learning_rate": 4.2320585169770565e-05, + "loss": 0.9737, + "step": 12655 + }, + { + "epoch": 0.73, + "grad_norm": 0.2431640625, + "learning_rate": 4.2238805559416594e-05, + "loss": 0.9187, + "step": 12660 + }, + { + "epoch": 0.73, + "grad_norm": 0.263671875, + "learning_rate": 4.215708387950391e-05, + "loss": 0.9978, + "step": 12665 + }, + { + "epoch": 0.73, + "grad_norm": 0.267578125, + "learning_rate": 4.20754202119937e-05, + "loss": 0.9516, + "step": 12670 + }, + { + "epoch": 0.73, + "grad_norm": 0.251953125, + "learning_rate": 4.1993814638788944e-05, + "loss": 0.9569, + "step": 12675 + }, + { + "epoch": 0.73, + "grad_norm": 0.2392578125, + "learning_rate": 4.191226724173426e-05, + "loss": 0.9413, + "step": 12680 + }, + { + "epoch": 0.73, + "grad_norm": 0.25390625, + "learning_rate": 4.1830778102616055e-05, + "loss": 0.9646, + "step": 12685 + }, + { + "epoch": 0.73, + "grad_norm": 0.2470703125, + "learning_rate": 4.174934730316216e-05, + "loss": 0.9325, + "step": 12690 + }, + { + "epoch": 0.73, + "grad_norm": 0.255859375, + "learning_rate": 4.166797492504206e-05, + "loss": 0.9078, + "step": 12695 + }, + { + "epoch": 0.73, + "grad_norm": 0.2392578125, + "learning_rate": 4.1586661049866496e-05, + "loss": 0.8876, + "step": 12700 + }, + { + "epoch": 0.73, + "grad_norm": 0.26953125, + "learning_rate": 4.1505405759187666e-05, + "loss": 0.9473, + "step": 12705 + }, + { + "epoch": 0.73, + "grad_norm": 0.271484375, + "learning_rate": 4.142420913449887e-05, + "loss": 0.9425, + "step": 12710 + }, + { + "epoch": 0.73, + "grad_norm": 0.259765625, + "learning_rate": 4.13430712572347e-05, + "loss": 0.924, + "step": 12715 + }, + { + "epoch": 0.73, + "grad_norm": 0.259765625, + "learning_rate": 4.12619922087708e-05, + "loss": 0.8847, + "step": 12720 + }, + { + "epoch": 0.73, + "grad_norm": 0.26171875, + "learning_rate": 4.118097207042373e-05, + "loss": 0.8964, + "step": 12725 + }, + { + "epoch": 0.73, + "grad_norm": 0.2392578125, + "learning_rate": 4.1100010923451084e-05, + "loss": 0.928, + "step": 12730 + }, + { + "epoch": 0.73, + "grad_norm": 0.25390625, + "learning_rate": 4.10191088490512e-05, + "loss": 0.958, + "step": 12735 + }, + { + "epoch": 0.73, + "grad_norm": 0.271484375, + "learning_rate": 4.093826592836322e-05, + "loss": 0.9281, + "step": 12740 + }, + { + "epoch": 0.73, + "grad_norm": 0.2578125, + "learning_rate": 4.0857482242466885e-05, + "loss": 0.9126, + "step": 12745 + }, + { + "epoch": 0.73, + "grad_norm": 0.265625, + "learning_rate": 4.077675787238267e-05, + "loss": 0.9593, + "step": 12750 + }, + { + "epoch": 0.73, + "grad_norm": 0.2578125, + "learning_rate": 4.0696092899071416e-05, + "loss": 0.8777, + "step": 12755 + }, + { + "epoch": 0.73, + "grad_norm": 0.25390625, + "learning_rate": 4.061548740343446e-05, + "loss": 0.9838, + "step": 12760 + }, + { + "epoch": 0.73, + "grad_norm": 0.26953125, + "learning_rate": 4.053494146631355e-05, + "loss": 0.9303, + "step": 12765 + }, + { + "epoch": 0.73, + "grad_norm": 0.2490234375, + "learning_rate": 4.045445516849055e-05, + "loss": 0.8684, + "step": 12770 + }, + { + "epoch": 0.73, + "grad_norm": 0.244140625, + "learning_rate": 4.037402859068764e-05, + "loss": 0.9017, + "step": 12775 + }, + { + "epoch": 0.73, + "grad_norm": 0.255859375, + "learning_rate": 4.029366181356702e-05, + "loss": 0.9531, + "step": 12780 + }, + { + "epoch": 0.73, + "grad_norm": 0.26171875, + "learning_rate": 4.0213354917731004e-05, + "loss": 0.9097, + "step": 12785 + }, + { + "epoch": 0.73, + "grad_norm": 0.263671875, + "learning_rate": 4.0133107983721726e-05, + "loss": 0.9436, + "step": 12790 + }, + { + "epoch": 0.73, + "grad_norm": 0.283203125, + "learning_rate": 4.005292109202129e-05, + "loss": 0.9268, + "step": 12795 + }, + { + "epoch": 0.73, + "grad_norm": 0.2578125, + "learning_rate": 3.997279432305158e-05, + "loss": 0.874, + "step": 12800 + }, + { + "epoch": 0.73, + "grad_norm": 0.248046875, + "learning_rate": 3.9892727757174074e-05, + "loss": 0.965, + "step": 12805 + }, + { + "epoch": 0.74, + "grad_norm": 0.259765625, + "learning_rate": 3.981272147469002e-05, + "loss": 0.9038, + "step": 12810 + }, + { + "epoch": 0.74, + "grad_norm": 0.2412109375, + "learning_rate": 3.973277555584004e-05, + "loss": 0.9387, + "step": 12815 + }, + { + "epoch": 0.74, + "grad_norm": 0.259765625, + "learning_rate": 3.965289008080438e-05, + "loss": 0.899, + "step": 12820 + }, + { + "epoch": 0.74, + "grad_norm": 0.2490234375, + "learning_rate": 3.957306512970258e-05, + "loss": 0.9678, + "step": 12825 + }, + { + "epoch": 0.74, + "grad_norm": 0.275390625, + "learning_rate": 3.9493300782593415e-05, + "loss": 1.016, + "step": 12830 + }, + { + "epoch": 0.74, + "grad_norm": 0.2578125, + "learning_rate": 3.9413597119475044e-05, + "loss": 0.9981, + "step": 12835 + }, + { + "epoch": 0.74, + "grad_norm": 0.2490234375, + "learning_rate": 3.9333954220284586e-05, + "loss": 0.9549, + "step": 12840 + }, + { + "epoch": 0.74, + "grad_norm": 0.291015625, + "learning_rate": 3.925437216489838e-05, + "loss": 0.9696, + "step": 12845 + }, + { + "epoch": 0.74, + "grad_norm": 0.271484375, + "learning_rate": 3.91748510331316e-05, + "loss": 1.0139, + "step": 12850 + }, + { + "epoch": 0.74, + "grad_norm": 0.251953125, + "learning_rate": 3.909539090473845e-05, + "loss": 0.9212, + "step": 12855 + }, + { + "epoch": 0.74, + "grad_norm": 0.26953125, + "learning_rate": 3.9015991859411815e-05, + "loss": 1.0049, + "step": 12860 + }, + { + "epoch": 0.74, + "grad_norm": 0.263671875, + "learning_rate": 3.8936653976783454e-05, + "loss": 0.9443, + "step": 12865 + }, + { + "epoch": 0.74, + "grad_norm": 0.2734375, + "learning_rate": 3.885737733642366e-05, + "loss": 0.9899, + "step": 12870 + }, + { + "epoch": 0.74, + "grad_norm": 0.263671875, + "learning_rate": 3.877816201784139e-05, + "loss": 0.9079, + "step": 12875 + }, + { + "epoch": 0.74, + "grad_norm": 0.259765625, + "learning_rate": 3.86990081004841e-05, + "loss": 0.9355, + "step": 12880 + }, + { + "epoch": 0.74, + "grad_norm": 0.2890625, + "learning_rate": 3.861991566373759e-05, + "loss": 0.9222, + "step": 12885 + }, + { + "epoch": 0.74, + "grad_norm": 0.2333984375, + "learning_rate": 3.85408847869261e-05, + "loss": 0.8832, + "step": 12890 + }, + { + "epoch": 0.74, + "grad_norm": 0.26171875, + "learning_rate": 3.846191554931201e-05, + "loss": 0.9305, + "step": 12895 + }, + { + "epoch": 0.74, + "grad_norm": 0.2734375, + "learning_rate": 3.838300803009601e-05, + "loss": 0.939, + "step": 12900 + }, + { + "epoch": 0.74, + "grad_norm": 0.26953125, + "learning_rate": 3.8304162308416766e-05, + "loss": 0.9762, + "step": 12905 + }, + { + "epoch": 0.74, + "grad_norm": 0.2890625, + "learning_rate": 3.822537846335109e-05, + "loss": 0.975, + "step": 12910 + }, + { + "epoch": 0.74, + "grad_norm": 0.267578125, + "learning_rate": 3.814665657391365e-05, + "loss": 0.9453, + "step": 12915 + }, + { + "epoch": 0.74, + "grad_norm": 0.25, + "learning_rate": 3.806799671905695e-05, + "loss": 0.9061, + "step": 12920 + }, + { + "epoch": 0.74, + "grad_norm": 0.263671875, + "learning_rate": 3.798939897767141e-05, + "loss": 0.903, + "step": 12925 + }, + { + "epoch": 0.74, + "grad_norm": 0.25, + "learning_rate": 3.7910863428584985e-05, + "loss": 0.8856, + "step": 12930 + }, + { + "epoch": 0.74, + "grad_norm": 0.25390625, + "learning_rate": 3.783239015056343e-05, + "loss": 0.9018, + "step": 12935 + }, + { + "epoch": 0.74, + "grad_norm": 0.255859375, + "learning_rate": 3.7753979222309876e-05, + "loss": 0.9687, + "step": 12940 + }, + { + "epoch": 0.74, + "grad_norm": 0.251953125, + "learning_rate": 3.767563072246508e-05, + "loss": 0.9176, + "step": 12945 + }, + { + "epoch": 0.74, + "grad_norm": 0.279296875, + "learning_rate": 3.7597344729607056e-05, + "loss": 0.874, + "step": 12950 + }, + { + "epoch": 0.74, + "grad_norm": 0.263671875, + "learning_rate": 3.751912132225118e-05, + "loss": 0.9264, + "step": 12955 + }, + { + "epoch": 0.74, + "grad_norm": 0.265625, + "learning_rate": 3.744096057885014e-05, + "loss": 0.9561, + "step": 12960 + }, + { + "epoch": 0.74, + "grad_norm": 0.263671875, + "learning_rate": 3.73628625777936e-05, + "loss": 0.9813, + "step": 12965 + }, + { + "epoch": 0.74, + "grad_norm": 0.27734375, + "learning_rate": 3.7284827397408485e-05, + "loss": 0.9638, + "step": 12970 + }, + { + "epoch": 0.74, + "grad_norm": 0.271484375, + "learning_rate": 3.720685511595855e-05, + "loss": 0.9361, + "step": 12975 + }, + { + "epoch": 0.74, + "grad_norm": 0.314453125, + "learning_rate": 3.712894581164461e-05, + "loss": 0.9472, + "step": 12980 + }, + { + "epoch": 0.75, + "grad_norm": 0.26171875, + "learning_rate": 3.705109956260419e-05, + "loss": 0.9043, + "step": 12985 + }, + { + "epoch": 0.75, + "grad_norm": 0.2353515625, + "learning_rate": 3.69733164469117e-05, + "loss": 0.98, + "step": 12990 + }, + { + "epoch": 0.75, + "grad_norm": 0.265625, + "learning_rate": 3.68955965425781e-05, + "loss": 0.9291, + "step": 12995 + }, + { + "epoch": 0.75, + "grad_norm": 0.2578125, + "learning_rate": 3.6817939927551105e-05, + "loss": 0.9215, + "step": 13000 + }, + { + "epoch": 0.75, + "grad_norm": 0.259765625, + "learning_rate": 3.67403466797148e-05, + "loss": 0.9316, + "step": 13005 + }, + { + "epoch": 0.75, + "grad_norm": 0.25390625, + "learning_rate": 3.6662816876889837e-05, + "loss": 0.8962, + "step": 13010 + }, + { + "epoch": 0.75, + "grad_norm": 0.259765625, + "learning_rate": 3.658535059683318e-05, + "loss": 0.9216, + "step": 13015 + }, + { + "epoch": 0.75, + "grad_norm": 0.251953125, + "learning_rate": 3.650794791723805e-05, + "loss": 0.8916, + "step": 13020 + }, + { + "epoch": 0.75, + "grad_norm": 0.263671875, + "learning_rate": 3.6430608915734e-05, + "loss": 0.9796, + "step": 13025 + }, + { + "epoch": 0.75, + "grad_norm": 0.248046875, + "learning_rate": 3.635333366988657e-05, + "loss": 0.9619, + "step": 13030 + }, + { + "epoch": 0.75, + "grad_norm": 0.255859375, + "learning_rate": 3.6276122257197465e-05, + "loss": 0.9352, + "step": 13035 + }, + { + "epoch": 0.75, + "grad_norm": 0.255859375, + "learning_rate": 3.6198974755104366e-05, + "loss": 0.8827, + "step": 13040 + }, + { + "epoch": 0.75, + "grad_norm": 0.2578125, + "learning_rate": 3.6121891240980764e-05, + "loss": 0.9641, + "step": 13045 + }, + { + "epoch": 0.75, + "grad_norm": 0.279296875, + "learning_rate": 3.604487179213612e-05, + "loss": 0.9531, + "step": 13050 + }, + { + "epoch": 0.75, + "grad_norm": 0.2734375, + "learning_rate": 3.596791648581546e-05, + "loss": 0.9712, + "step": 13055 + }, + { + "epoch": 0.75, + "grad_norm": 0.271484375, + "learning_rate": 3.589102539919965e-05, + "loss": 0.9869, + "step": 13060 + }, + { + "epoch": 0.75, + "grad_norm": 0.24609375, + "learning_rate": 3.5814198609405024e-05, + "loss": 0.9768, + "step": 13065 + }, + { + "epoch": 0.75, + "grad_norm": 0.2578125, + "learning_rate": 3.5737436193483555e-05, + "loss": 0.9148, + "step": 13070 + }, + { + "epoch": 0.75, + "grad_norm": 0.251953125, + "learning_rate": 3.56607382284225e-05, + "loss": 0.9513, + "step": 13075 + }, + { + "epoch": 0.75, + "grad_norm": 0.259765625, + "learning_rate": 3.5584104791144603e-05, + "loss": 0.929, + "step": 13080 + }, + { + "epoch": 0.75, + "grad_norm": 0.24609375, + "learning_rate": 3.5507535958507864e-05, + "loss": 0.8616, + "step": 13085 + }, + { + "epoch": 0.75, + "grad_norm": 0.255859375, + "learning_rate": 3.543103180730541e-05, + "loss": 0.9664, + "step": 13090 + }, + { + "epoch": 0.75, + "grad_norm": 0.248046875, + "learning_rate": 3.535459241426563e-05, + "loss": 0.8815, + "step": 13095 + }, + { + "epoch": 0.75, + "grad_norm": 0.26953125, + "learning_rate": 3.5278217856051866e-05, + "loss": 0.8886, + "step": 13100 + }, + { + "epoch": 0.75, + "grad_norm": 0.255859375, + "learning_rate": 3.5201908209262445e-05, + "loss": 0.9652, + "step": 13105 + }, + { + "epoch": 0.75, + "grad_norm": 0.25390625, + "learning_rate": 3.5125663550430585e-05, + "loss": 0.9308, + "step": 13110 + }, + { + "epoch": 0.75, + "grad_norm": 0.2431640625, + "learning_rate": 3.504948395602442e-05, + "loss": 0.8708, + "step": 13115 + }, + { + "epoch": 0.75, + "grad_norm": 0.271484375, + "learning_rate": 3.4973369502446685e-05, + "loss": 0.9243, + "step": 13120 + }, + { + "epoch": 0.75, + "grad_norm": 0.259765625, + "learning_rate": 3.4897320266034905e-05, + "loss": 0.95, + "step": 13125 + }, + { + "epoch": 0.75, + "grad_norm": 0.240234375, + "learning_rate": 3.482133632306117e-05, + "loss": 0.8964, + "step": 13130 + }, + { + "epoch": 0.75, + "grad_norm": 0.25, + "learning_rate": 3.4745417749732003e-05, + "loss": 0.9514, + "step": 13135 + }, + { + "epoch": 0.75, + "grad_norm": 0.2431640625, + "learning_rate": 3.466956462218849e-05, + "loss": 0.9212, + "step": 13140 + }, + { + "epoch": 0.75, + "grad_norm": 0.248046875, + "learning_rate": 3.4593777016505946e-05, + "loss": 0.9294, + "step": 13145 + }, + { + "epoch": 0.75, + "grad_norm": 0.251953125, + "learning_rate": 3.451805500869413e-05, + "loss": 0.9664, + "step": 13150 + }, + { + "epoch": 0.75, + "grad_norm": 0.2431640625, + "learning_rate": 3.444239867469683e-05, + "loss": 0.9726, + "step": 13155 + }, + { + "epoch": 0.76, + "grad_norm": 0.28515625, + "learning_rate": 3.4366808090392123e-05, + "loss": 0.9226, + "step": 13160 + }, + { + "epoch": 0.76, + "grad_norm": 0.2412109375, + "learning_rate": 3.429128333159208e-05, + "loss": 0.8796, + "step": 13165 + }, + { + "epoch": 0.76, + "grad_norm": 0.267578125, + "learning_rate": 3.421582447404273e-05, + "loss": 0.9727, + "step": 13170 + }, + { + "epoch": 0.76, + "grad_norm": 0.2578125, + "learning_rate": 3.414043159342408e-05, + "loss": 0.9514, + "step": 13175 + }, + { + "epoch": 0.76, + "grad_norm": 0.2578125, + "learning_rate": 3.406510476534985e-05, + "loss": 0.9228, + "step": 13180 + }, + { + "epoch": 0.76, + "grad_norm": 0.25390625, + "learning_rate": 3.398984406536765e-05, + "loss": 0.8952, + "step": 13185 + }, + { + "epoch": 0.76, + "grad_norm": 0.259765625, + "learning_rate": 3.391464956895869e-05, + "loss": 0.8966, + "step": 13190 + }, + { + "epoch": 0.76, + "grad_norm": 0.275390625, + "learning_rate": 3.3839521351537726e-05, + "loss": 0.9749, + "step": 13195 + }, + { + "epoch": 0.76, + "grad_norm": 0.263671875, + "learning_rate": 3.376445948845322e-05, + "loss": 0.9342, + "step": 13200 + }, + { + "epoch": 0.76, + "grad_norm": 0.267578125, + "learning_rate": 3.368946405498686e-05, + "loss": 0.9269, + "step": 13205 + }, + { + "epoch": 0.76, + "grad_norm": 0.248046875, + "learning_rate": 3.361453512635393e-05, + "loss": 0.9147, + "step": 13210 + }, + { + "epoch": 0.76, + "grad_norm": 0.25390625, + "learning_rate": 3.353967277770282e-05, + "loss": 0.9704, + "step": 13215 + }, + { + "epoch": 0.76, + "grad_norm": 0.265625, + "learning_rate": 3.346487708411532e-05, + "loss": 0.9207, + "step": 13220 + }, + { + "epoch": 0.76, + "grad_norm": 0.2578125, + "learning_rate": 3.3390148120606204e-05, + "loss": 0.9769, + "step": 13225 + }, + { + "epoch": 0.76, + "grad_norm": 0.2490234375, + "learning_rate": 3.331548596212347e-05, + "loss": 0.9078, + "step": 13230 + }, + { + "epoch": 0.76, + "grad_norm": 0.24609375, + "learning_rate": 3.324089068354797e-05, + "loss": 0.9256, + "step": 13235 + }, + { + "epoch": 0.76, + "grad_norm": 0.2578125, + "learning_rate": 3.3166362359693596e-05, + "loss": 0.9599, + "step": 13240 + }, + { + "epoch": 0.76, + "grad_norm": 0.263671875, + "learning_rate": 3.3091901065307084e-05, + "loss": 0.903, + "step": 13245 + }, + { + "epoch": 0.76, + "grad_norm": 0.271484375, + "learning_rate": 3.301750687506784e-05, + "loss": 0.9432, + "step": 13250 + }, + { + "epoch": 0.76, + "grad_norm": 0.26171875, + "learning_rate": 3.29431798635881e-05, + "loss": 0.9166, + "step": 13255 + }, + { + "epoch": 0.76, + "grad_norm": 0.27734375, + "learning_rate": 3.2868920105412594e-05, + "loss": 0.9601, + "step": 13260 + }, + { + "epoch": 0.76, + "grad_norm": 0.259765625, + "learning_rate": 3.279472767501876e-05, + "loss": 0.8922, + "step": 13265 + }, + { + "epoch": 0.76, + "grad_norm": 0.271484375, + "learning_rate": 3.272060264681631e-05, + "loss": 0.9163, + "step": 13270 + }, + { + "epoch": 0.76, + "grad_norm": 0.255859375, + "learning_rate": 3.264654509514757e-05, + "loss": 0.8981, + "step": 13275 + }, + { + "epoch": 0.76, + "grad_norm": 0.25390625, + "learning_rate": 3.257255509428705e-05, + "loss": 0.9022, + "step": 13280 + }, + { + "epoch": 0.76, + "grad_norm": 0.25, + "learning_rate": 3.24986327184415e-05, + "loss": 0.9099, + "step": 13285 + }, + { + "epoch": 0.76, + "grad_norm": 0.2890625, + "learning_rate": 3.2424778041749984e-05, + "loss": 0.9285, + "step": 13290 + }, + { + "epoch": 0.76, + "grad_norm": 0.255859375, + "learning_rate": 3.235099113828351e-05, + "loss": 0.9232, + "step": 13295 + }, + { + "epoch": 0.76, + "grad_norm": 0.259765625, + "learning_rate": 3.227727208204523e-05, + "loss": 0.8646, + "step": 13300 + }, + { + "epoch": 0.76, + "grad_norm": 0.2431640625, + "learning_rate": 3.2203620946970156e-05, + "loss": 0.957, + "step": 13305 + }, + { + "epoch": 0.76, + "grad_norm": 0.267578125, + "learning_rate": 3.213003780692531e-05, + "loss": 0.9429, + "step": 13310 + }, + { + "epoch": 0.76, + "grad_norm": 0.255859375, + "learning_rate": 3.2056522735709346e-05, + "loss": 0.9812, + "step": 13315 + }, + { + "epoch": 0.76, + "grad_norm": 0.2734375, + "learning_rate": 3.198307580705281e-05, + "loss": 0.9726, + "step": 13320 + }, + { + "epoch": 0.76, + "grad_norm": 0.265625, + "learning_rate": 3.190969709461783e-05, + "loss": 0.9495, + "step": 13325 + }, + { + "epoch": 0.76, + "grad_norm": 0.2490234375, + "learning_rate": 3.183638667199809e-05, + "loss": 0.9097, + "step": 13330 + }, + { + "epoch": 0.77, + "grad_norm": 0.240234375, + "learning_rate": 3.176314461271887e-05, + "loss": 1.0239, + "step": 13335 + }, + { + "epoch": 0.77, + "grad_norm": 0.259765625, + "learning_rate": 3.1689970990236784e-05, + "loss": 0.9087, + "step": 13340 + }, + { + "epoch": 0.77, + "grad_norm": 0.259765625, + "learning_rate": 3.1616865877939915e-05, + "loss": 0.9057, + "step": 13345 + }, + { + "epoch": 0.77, + "grad_norm": 0.255859375, + "learning_rate": 3.1543829349147523e-05, + "loss": 0.8985, + "step": 13350 + }, + { + "epoch": 0.77, + "grad_norm": 0.2421875, + "learning_rate": 3.147086147711022e-05, + "loss": 0.9505, + "step": 13355 + }, + { + "epoch": 0.77, + "grad_norm": 0.267578125, + "learning_rate": 3.139796233500958e-05, + "loss": 0.9484, + "step": 13360 + }, + { + "epoch": 0.77, + "grad_norm": 0.294921875, + "learning_rate": 3.132513199595846e-05, + "loss": 0.9627, + "step": 13365 + }, + { + "epoch": 0.77, + "grad_norm": 0.2490234375, + "learning_rate": 3.1252370533000494e-05, + "loss": 0.9296, + "step": 13370 + }, + { + "epoch": 0.77, + "grad_norm": 0.259765625, + "learning_rate": 3.1179678019110434e-05, + "loss": 0.9281, + "step": 13375 + }, + { + "epoch": 0.77, + "grad_norm": 0.251953125, + "learning_rate": 3.110705452719376e-05, + "loss": 0.9775, + "step": 13380 + }, + { + "epoch": 0.77, + "grad_norm": 0.259765625, + "learning_rate": 3.1034500130086706e-05, + "loss": 0.9013, + "step": 13385 + }, + { + "epoch": 0.77, + "grad_norm": 0.267578125, + "learning_rate": 3.096201490055635e-05, + "loss": 0.8999, + "step": 13390 + }, + { + "epoch": 0.77, + "grad_norm": 0.255859375, + "learning_rate": 3.088959891130022e-05, + "loss": 0.902, + "step": 13395 + }, + { + "epoch": 0.77, + "grad_norm": 0.251953125, + "learning_rate": 3.081725223494656e-05, + "loss": 0.9296, + "step": 13400 + }, + { + "epoch": 0.77, + "grad_norm": 0.2470703125, + "learning_rate": 3.074497494405404e-05, + "loss": 0.944, + "step": 13405 + }, + { + "epoch": 0.77, + "grad_norm": 0.2451171875, + "learning_rate": 3.0672767111111666e-05, + "loss": 0.9, + "step": 13410 + }, + { + "epoch": 0.77, + "grad_norm": 0.259765625, + "learning_rate": 3.0600628808538915e-05, + "loss": 0.9412, + "step": 13415 + }, + { + "epoch": 0.77, + "grad_norm": 0.2421875, + "learning_rate": 3.05285601086854e-05, + "loss": 0.9297, + "step": 13420 + }, + { + "epoch": 0.77, + "grad_norm": 0.25, + "learning_rate": 3.045656108383106e-05, + "loss": 0.9706, + "step": 13425 + }, + { + "epoch": 0.77, + "grad_norm": 0.240234375, + "learning_rate": 3.0384631806185815e-05, + "loss": 0.9361, + "step": 13430 + }, + { + "epoch": 0.77, + "grad_norm": 0.263671875, + "learning_rate": 3.0312772347889773e-05, + "loss": 0.9369, + "step": 13435 + }, + { + "epoch": 0.77, + "grad_norm": 0.263671875, + "learning_rate": 3.0240982781012873e-05, + "loss": 0.9819, + "step": 13440 + }, + { + "epoch": 0.77, + "grad_norm": 0.265625, + "learning_rate": 3.0169263177555085e-05, + "loss": 0.9965, + "step": 13445 + }, + { + "epoch": 0.77, + "grad_norm": 0.267578125, + "learning_rate": 3.0097613609446172e-05, + "loss": 0.9073, + "step": 13450 + }, + { + "epoch": 0.77, + "grad_norm": 0.26953125, + "learning_rate": 3.002603414854559e-05, + "loss": 1.0237, + "step": 13455 + }, + { + "epoch": 0.77, + "grad_norm": 0.259765625, + "learning_rate": 2.9954524866642585e-05, + "loss": 0.9512, + "step": 13460 + }, + { + "epoch": 0.77, + "grad_norm": 0.2470703125, + "learning_rate": 2.988308583545596e-05, + "loss": 0.9565, + "step": 13465 + }, + { + "epoch": 0.77, + "grad_norm": 0.263671875, + "learning_rate": 2.9811717126634066e-05, + "loss": 0.9656, + "step": 13470 + }, + { + "epoch": 0.77, + "grad_norm": 0.271484375, + "learning_rate": 2.974041881175468e-05, + "loss": 0.9785, + "step": 13475 + }, + { + "epoch": 0.77, + "grad_norm": 0.2734375, + "learning_rate": 2.9669190962325112e-05, + "loss": 0.9613, + "step": 13480 + }, + { + "epoch": 0.77, + "grad_norm": 0.2578125, + "learning_rate": 2.959803364978184e-05, + "loss": 0.9328, + "step": 13485 + }, + { + "epoch": 0.77, + "grad_norm": 0.265625, + "learning_rate": 2.952694694549073e-05, + "loss": 0.9668, + "step": 13490 + }, + { + "epoch": 0.77, + "grad_norm": 0.2470703125, + "learning_rate": 2.9455930920746778e-05, + "loss": 0.8782, + "step": 13495 + }, + { + "epoch": 0.77, + "grad_norm": 0.265625, + "learning_rate": 2.9384985646774053e-05, + "loss": 1.0128, + "step": 13500 + }, + { + "epoch": 0.77, + "grad_norm": 0.27734375, + "learning_rate": 2.9314111194725757e-05, + "loss": 0.9231, + "step": 13505 + }, + { + "epoch": 0.78, + "grad_norm": 0.24609375, + "learning_rate": 2.9243307635683957e-05, + "loss": 0.9255, + "step": 13510 + }, + { + "epoch": 0.78, + "grad_norm": 0.25390625, + "learning_rate": 2.9172575040659744e-05, + "loss": 0.9449, + "step": 13515 + }, + { + "epoch": 0.78, + "grad_norm": 0.259765625, + "learning_rate": 2.910191348059289e-05, + "loss": 0.8508, + "step": 13520 + }, + { + "epoch": 0.78, + "grad_norm": 0.248046875, + "learning_rate": 2.9031323026352053e-05, + "loss": 0.9159, + "step": 13525 + }, + { + "epoch": 0.78, + "grad_norm": 0.248046875, + "learning_rate": 2.8960803748734534e-05, + "loss": 0.9562, + "step": 13530 + }, + { + "epoch": 0.78, + "grad_norm": 0.271484375, + "learning_rate": 2.8890355718466177e-05, + "loss": 0.8926, + "step": 13535 + }, + { + "epoch": 0.78, + "grad_norm": 0.267578125, + "learning_rate": 2.8819979006201526e-05, + "loss": 0.9727, + "step": 13540 + }, + { + "epoch": 0.78, + "grad_norm": 0.25390625, + "learning_rate": 2.8749673682523404e-05, + "loss": 0.954, + "step": 13545 + }, + { + "epoch": 0.78, + "grad_norm": 0.2578125, + "learning_rate": 2.8679439817943232e-05, + "loss": 0.8983, + "step": 13550 + }, + { + "epoch": 0.78, + "grad_norm": 0.248046875, + "learning_rate": 2.860927748290061e-05, + "loss": 0.9354, + "step": 13555 + }, + { + "epoch": 0.78, + "grad_norm": 0.26171875, + "learning_rate": 2.853918674776345e-05, + "loss": 0.9717, + "step": 13560 + }, + { + "epoch": 0.78, + "grad_norm": 0.2578125, + "learning_rate": 2.84691676828279e-05, + "loss": 0.9196, + "step": 13565 + }, + { + "epoch": 0.78, + "grad_norm": 0.251953125, + "learning_rate": 2.8399220358318148e-05, + "loss": 0.956, + "step": 13570 + }, + { + "epoch": 0.78, + "grad_norm": 0.259765625, + "learning_rate": 2.832934484438652e-05, + "loss": 1.0523, + "step": 13575 + }, + { + "epoch": 0.78, + "grad_norm": 0.25390625, + "learning_rate": 2.8259541211113216e-05, + "loss": 1.0049, + "step": 13580 + }, + { + "epoch": 0.78, + "grad_norm": 0.51953125, + "learning_rate": 2.8189809528506462e-05, + "loss": 0.9359, + "step": 13585 + }, + { + "epoch": 0.78, + "grad_norm": 0.263671875, + "learning_rate": 2.81201498665022e-05, + "loss": 0.8533, + "step": 13590 + }, + { + "epoch": 0.78, + "grad_norm": 0.251953125, + "learning_rate": 2.8050562294964267e-05, + "loss": 0.8861, + "step": 13595 + }, + { + "epoch": 0.78, + "grad_norm": 0.26171875, + "learning_rate": 2.798104688368407e-05, + "loss": 0.9037, + "step": 13600 + }, + { + "epoch": 0.78, + "grad_norm": 0.25, + "learning_rate": 2.791160370238075e-05, + "loss": 0.9423, + "step": 13605 + }, + { + "epoch": 0.78, + "grad_norm": 0.23828125, + "learning_rate": 2.7842232820700977e-05, + "loss": 0.9029, + "step": 13610 + }, + { + "epoch": 0.78, + "grad_norm": 0.294921875, + "learning_rate": 2.7772934308218846e-05, + "loss": 0.9684, + "step": 13615 + }, + { + "epoch": 0.78, + "grad_norm": 0.265625, + "learning_rate": 2.7703708234435988e-05, + "loss": 0.9452, + "step": 13620 + }, + { + "epoch": 0.78, + "grad_norm": 0.27734375, + "learning_rate": 2.7634554668781242e-05, + "loss": 0.9624, + "step": 13625 + }, + { + "epoch": 0.78, + "grad_norm": 0.2890625, + "learning_rate": 2.7565473680610887e-05, + "loss": 0.9741, + "step": 13630 + }, + { + "epoch": 0.78, + "grad_norm": 0.25390625, + "learning_rate": 2.7496465339208233e-05, + "loss": 0.9307, + "step": 13635 + }, + { + "epoch": 0.78, + "grad_norm": 0.263671875, + "learning_rate": 2.7427529713783905e-05, + "loss": 0.9435, + "step": 13640 + }, + { + "epoch": 0.78, + "grad_norm": 0.26171875, + "learning_rate": 2.7358666873475493e-05, + "loss": 0.8875, + "step": 13645 + }, + { + "epoch": 0.78, + "grad_norm": 0.251953125, + "learning_rate": 2.7289876887347554e-05, + "loss": 0.9351, + "step": 13650 + }, + { + "epoch": 0.78, + "grad_norm": 0.263671875, + "learning_rate": 2.722115982439173e-05, + "loss": 0.928, + "step": 13655 + }, + { + "epoch": 0.78, + "grad_norm": 0.248046875, + "learning_rate": 2.7152515753526364e-05, + "loss": 0.8899, + "step": 13660 + }, + { + "epoch": 0.78, + "grad_norm": 0.26171875, + "learning_rate": 2.70839447435967e-05, + "loss": 1.0374, + "step": 13665 + }, + { + "epoch": 0.78, + "grad_norm": 0.28515625, + "learning_rate": 2.7015446863374637e-05, + "loss": 0.984, + "step": 13670 + }, + { + "epoch": 0.78, + "grad_norm": 0.26171875, + "learning_rate": 2.6947022181558813e-05, + "loss": 0.8674, + "step": 13675 + }, + { + "epoch": 0.78, + "grad_norm": 0.255859375, + "learning_rate": 2.6878670766774328e-05, + "loss": 0.9809, + "step": 13680 + }, + { + "epoch": 0.79, + "grad_norm": 0.259765625, + "learning_rate": 2.6810392687572928e-05, + "loss": 0.9736, + "step": 13685 + }, + { + "epoch": 0.79, + "grad_norm": 0.24609375, + "learning_rate": 2.6742188012432767e-05, + "loss": 0.9327, + "step": 13690 + }, + { + "epoch": 0.79, + "grad_norm": 0.251953125, + "learning_rate": 2.667405680975831e-05, + "loss": 0.923, + "step": 13695 + }, + { + "epoch": 0.79, + "grad_norm": 0.265625, + "learning_rate": 2.6605999147880456e-05, + "loss": 1.0264, + "step": 13700 + }, + { + "epoch": 0.79, + "grad_norm": 0.26171875, + "learning_rate": 2.6538015095056223e-05, + "loss": 0.9748, + "step": 13705 + }, + { + "epoch": 0.79, + "grad_norm": 0.271484375, + "learning_rate": 2.6470104719468925e-05, + "loss": 0.8896, + "step": 13710 + }, + { + "epoch": 0.79, + "grad_norm": 0.26171875, + "learning_rate": 2.6402268089227866e-05, + "loss": 0.9706, + "step": 13715 + }, + { + "epoch": 0.79, + "grad_norm": 0.24609375, + "learning_rate": 2.6334505272368493e-05, + "loss": 0.968, + "step": 13720 + }, + { + "epoch": 0.79, + "grad_norm": 0.2451171875, + "learning_rate": 2.626681633685213e-05, + "loss": 0.9703, + "step": 13725 + }, + { + "epoch": 0.79, + "grad_norm": 0.263671875, + "learning_rate": 2.6199201350566104e-05, + "loss": 0.9082, + "step": 13730 + }, + { + "epoch": 0.79, + "grad_norm": 0.267578125, + "learning_rate": 2.613166038132345e-05, + "loss": 0.9308, + "step": 13735 + }, + { + "epoch": 0.79, + "grad_norm": 0.2490234375, + "learning_rate": 2.606419349686312e-05, + "loss": 0.9935, + "step": 13740 + }, + { + "epoch": 0.79, + "grad_norm": 0.2373046875, + "learning_rate": 2.5996800764849638e-05, + "loss": 0.8884, + "step": 13745 + }, + { + "epoch": 0.79, + "grad_norm": 0.26171875, + "learning_rate": 2.5929482252873183e-05, + "loss": 0.9865, + "step": 13750 + }, + { + "epoch": 0.79, + "grad_norm": 0.23828125, + "learning_rate": 2.5862238028449582e-05, + "loss": 0.9271, + "step": 13755 + }, + { + "epoch": 0.79, + "grad_norm": 0.25, + "learning_rate": 2.579506815902002e-05, + "loss": 0.9695, + "step": 13760 + }, + { + "epoch": 0.79, + "grad_norm": 0.2578125, + "learning_rate": 2.5727972711951208e-05, + "loss": 0.9152, + "step": 13765 + }, + { + "epoch": 0.79, + "grad_norm": 0.263671875, + "learning_rate": 2.5660951754535245e-05, + "loss": 0.914, + "step": 13770 + }, + { + "epoch": 0.79, + "grad_norm": 0.255859375, + "learning_rate": 2.559400535398938e-05, + "loss": 0.9339, + "step": 13775 + }, + { + "epoch": 0.79, + "grad_norm": 0.251953125, + "learning_rate": 2.5527133577456254e-05, + "loss": 1.0445, + "step": 13780 + }, + { + "epoch": 0.79, + "grad_norm": 0.25, + "learning_rate": 2.5460336492003522e-05, + "loss": 0.921, + "step": 13785 + }, + { + "epoch": 0.79, + "grad_norm": 0.26953125, + "learning_rate": 2.5393614164624047e-05, + "loss": 0.9167, + "step": 13790 + }, + { + "epoch": 0.79, + "grad_norm": 0.25, + "learning_rate": 2.5326966662235597e-05, + "loss": 0.9875, + "step": 13795 + }, + { + "epoch": 0.79, + "grad_norm": 0.2578125, + "learning_rate": 2.5260394051681024e-05, + "loss": 1.0246, + "step": 13800 + }, + { + "epoch": 0.79, + "grad_norm": 0.26171875, + "learning_rate": 2.5193896399727945e-05, + "loss": 0.9762, + "step": 13805 + }, + { + "epoch": 0.79, + "grad_norm": 0.2578125, + "learning_rate": 2.5127473773068888e-05, + "loss": 0.9957, + "step": 13810 + }, + { + "epoch": 0.79, + "grad_norm": 0.259765625, + "learning_rate": 2.506112623832113e-05, + "loss": 0.9724, + "step": 13815 + }, + { + "epoch": 0.79, + "grad_norm": 0.26171875, + "learning_rate": 2.499485386202659e-05, + "loss": 0.9355, + "step": 13820 + }, + { + "epoch": 0.79, + "grad_norm": 0.265625, + "learning_rate": 2.49286567106518e-05, + "loss": 0.9687, + "step": 13825 + }, + { + "epoch": 0.79, + "grad_norm": 0.2578125, + "learning_rate": 2.4862534850587925e-05, + "loss": 0.9089, + "step": 13830 + }, + { + "epoch": 0.79, + "grad_norm": 0.2451171875, + "learning_rate": 2.4796488348150548e-05, + "loss": 0.8917, + "step": 13835 + }, + { + "epoch": 0.79, + "grad_norm": 0.251953125, + "learning_rate": 2.4730517269579667e-05, + "loss": 0.9085, + "step": 13840 + }, + { + "epoch": 0.79, + "grad_norm": 0.2734375, + "learning_rate": 2.4664621681039723e-05, + "loss": 0.9635, + "step": 13845 + }, + { + "epoch": 0.79, + "grad_norm": 0.251953125, + "learning_rate": 2.459880164861932e-05, + "loss": 1.0015, + "step": 13850 + }, + { + "epoch": 0.79, + "grad_norm": 0.271484375, + "learning_rate": 2.453305723833139e-05, + "loss": 0.9528, + "step": 13855 + }, + { + "epoch": 0.8, + "grad_norm": 0.255859375, + "learning_rate": 2.4467388516113e-05, + "loss": 0.9126, + "step": 13860 + }, + { + "epoch": 0.8, + "grad_norm": 0.248046875, + "learning_rate": 2.4401795547825234e-05, + "loss": 0.91, + "step": 13865 + }, + { + "epoch": 0.8, + "grad_norm": 0.26953125, + "learning_rate": 2.433627839925332e-05, + "loss": 0.9091, + "step": 13870 + }, + { + "epoch": 0.8, + "grad_norm": 0.2578125, + "learning_rate": 2.427083713610632e-05, + "loss": 0.9037, + "step": 13875 + }, + { + "epoch": 0.8, + "grad_norm": 0.255859375, + "learning_rate": 2.42054718240173e-05, + "loss": 0.9868, + "step": 13880 + }, + { + "epoch": 0.8, + "grad_norm": 0.25390625, + "learning_rate": 2.4140182528543044e-05, + "loss": 0.9237, + "step": 13885 + }, + { + "epoch": 0.8, + "grad_norm": 0.265625, + "learning_rate": 2.4074969315164176e-05, + "loss": 0.9442, + "step": 13890 + }, + { + "epoch": 0.8, + "grad_norm": 0.251953125, + "learning_rate": 2.4009832249285035e-05, + "loss": 0.9621, + "step": 13895 + }, + { + "epoch": 0.8, + "grad_norm": 0.28515625, + "learning_rate": 2.3944771396233467e-05, + "loss": 1.0088, + "step": 13900 + }, + { + "epoch": 0.8, + "grad_norm": 0.2490234375, + "learning_rate": 2.387978682126104e-05, + "loss": 0.955, + "step": 13905 + }, + { + "epoch": 0.8, + "grad_norm": 0.255859375, + "learning_rate": 2.3814878589542678e-05, + "loss": 0.9292, + "step": 13910 + }, + { + "epoch": 0.8, + "grad_norm": 0.2451171875, + "learning_rate": 2.3750046766176846e-05, + "loss": 0.9506, + "step": 13915 + }, + { + "epoch": 0.8, + "grad_norm": 0.240234375, + "learning_rate": 2.368529141618533e-05, + "loss": 0.9709, + "step": 13920 + }, + { + "epoch": 0.8, + "grad_norm": 0.255859375, + "learning_rate": 2.362061260451316e-05, + "loss": 0.9349, + "step": 13925 + }, + { + "epoch": 0.8, + "grad_norm": 0.2734375, + "learning_rate": 2.3556010396028737e-05, + "loss": 0.9998, + "step": 13930 + }, + { + "epoch": 0.8, + "grad_norm": 0.26171875, + "learning_rate": 2.3491484855523504e-05, + "loss": 0.9845, + "step": 13935 + }, + { + "epoch": 0.8, + "grad_norm": 0.263671875, + "learning_rate": 2.3427036047712125e-05, + "loss": 0.9669, + "step": 13940 + }, + { + "epoch": 0.8, + "grad_norm": 0.26171875, + "learning_rate": 2.3362664037232184e-05, + "loss": 0.9731, + "step": 13945 + }, + { + "epoch": 0.8, + "grad_norm": 0.275390625, + "learning_rate": 2.3298368888644386e-05, + "loss": 0.9392, + "step": 13950 + }, + { + "epoch": 0.8, + "grad_norm": 0.248046875, + "learning_rate": 2.323415066643221e-05, + "loss": 0.9111, + "step": 13955 + }, + { + "epoch": 0.8, + "grad_norm": 0.2470703125, + "learning_rate": 2.3170009435002083e-05, + "loss": 0.9801, + "step": 13960 + }, + { + "epoch": 0.8, + "grad_norm": 0.251953125, + "learning_rate": 2.3105945258683125e-05, + "loss": 0.9287, + "step": 13965 + }, + { + "epoch": 0.8, + "grad_norm": 0.248046875, + "learning_rate": 2.3041958201727275e-05, + "loss": 0.9081, + "step": 13970 + }, + { + "epoch": 0.8, + "grad_norm": 0.24609375, + "learning_rate": 2.29780483283091e-05, + "loss": 0.856, + "step": 13975 + }, + { + "epoch": 0.8, + "grad_norm": 0.298828125, + "learning_rate": 2.2914215702525677e-05, + "loss": 0.9447, + "step": 13980 + }, + { + "epoch": 0.8, + "grad_norm": 0.287109375, + "learning_rate": 2.285046038839672e-05, + "loss": 1.0183, + "step": 13985 + }, + { + "epoch": 0.8, + "grad_norm": 0.2578125, + "learning_rate": 2.2786782449864298e-05, + "loss": 0.9925, + "step": 13990 + }, + { + "epoch": 0.8, + "grad_norm": 0.26953125, + "learning_rate": 2.2723181950793004e-05, + "loss": 0.9696, + "step": 13995 + }, + { + "epoch": 0.8, + "grad_norm": 0.27734375, + "learning_rate": 2.265965895496962e-05, + "loss": 0.9292, + "step": 14000 + }, + { + "epoch": 0.8, + "grad_norm": 0.25, + "learning_rate": 2.259621352610333e-05, + "loss": 0.9397, + "step": 14005 + }, + { + "epoch": 0.8, + "grad_norm": 0.259765625, + "learning_rate": 2.2532845727825445e-05, + "loss": 0.89, + "step": 14010 + }, + { + "epoch": 0.8, + "grad_norm": 0.2412109375, + "learning_rate": 2.246955562368941e-05, + "loss": 0.954, + "step": 14015 + }, + { + "epoch": 0.8, + "grad_norm": 0.263671875, + "learning_rate": 2.2406343277170827e-05, + "loss": 0.9743, + "step": 14020 + }, + { + "epoch": 0.8, + "grad_norm": 0.2421875, + "learning_rate": 2.234320875166721e-05, + "loss": 0.9589, + "step": 14025 + }, + { + "epoch": 0.81, + "grad_norm": 0.2451171875, + "learning_rate": 2.228015211049813e-05, + "loss": 0.9826, + "step": 14030 + }, + { + "epoch": 0.81, + "grad_norm": 0.26171875, + "learning_rate": 2.2217173416904935e-05, + "loss": 0.9069, + "step": 14035 + }, + { + "epoch": 0.81, + "grad_norm": 0.2470703125, + "learning_rate": 2.2154272734050908e-05, + "loss": 0.9242, + "step": 14040 + }, + { + "epoch": 0.81, + "grad_norm": 0.29296875, + "learning_rate": 2.2091450125020986e-05, + "loss": 0.9874, + "step": 14045 + }, + { + "epoch": 0.81, + "grad_norm": 0.271484375, + "learning_rate": 2.202870565282188e-05, + "loss": 0.9641, + "step": 14050 + }, + { + "epoch": 0.81, + "grad_norm": 0.26171875, + "learning_rate": 2.1966039380381944e-05, + "loss": 0.9674, + "step": 14055 + }, + { + "epoch": 0.81, + "grad_norm": 0.30078125, + "learning_rate": 2.1903451370551e-05, + "loss": 0.9647, + "step": 14060 + }, + { + "epoch": 0.81, + "grad_norm": 0.2392578125, + "learning_rate": 2.1840941686100524e-05, + "loss": 0.9176, + "step": 14065 + }, + { + "epoch": 0.81, + "grad_norm": 0.26171875, + "learning_rate": 2.1778510389723283e-05, + "loss": 0.923, + "step": 14070 + }, + { + "epoch": 0.81, + "grad_norm": 0.263671875, + "learning_rate": 2.1716157544033578e-05, + "loss": 0.8969, + "step": 14075 + }, + { + "epoch": 0.81, + "grad_norm": 0.2451171875, + "learning_rate": 2.1653883211566895e-05, + "loss": 0.9502, + "step": 14080 + }, + { + "epoch": 0.81, + "grad_norm": 0.251953125, + "learning_rate": 2.1591687454780087e-05, + "loss": 0.985, + "step": 14085 + }, + { + "epoch": 0.81, + "grad_norm": 0.251953125, + "learning_rate": 2.152957033605111e-05, + "loss": 0.9258, + "step": 14090 + }, + { + "epoch": 0.81, + "grad_norm": 0.30078125, + "learning_rate": 2.146753191767913e-05, + "loss": 0.8852, + "step": 14095 + }, + { + "epoch": 0.81, + "grad_norm": 0.265625, + "learning_rate": 2.14055722618843e-05, + "loss": 0.8649, + "step": 14100 + }, + { + "epoch": 0.81, + "grad_norm": 0.2734375, + "learning_rate": 2.1343691430807878e-05, + "loss": 0.9535, + "step": 14105 + }, + { + "epoch": 0.81, + "grad_norm": 0.275390625, + "learning_rate": 2.1281889486511984e-05, + "loss": 1.0796, + "step": 14110 + }, + { + "epoch": 0.81, + "grad_norm": 0.2470703125, + "learning_rate": 2.122016649097962e-05, + "loss": 0.937, + "step": 14115 + }, + { + "epoch": 0.81, + "grad_norm": 0.2578125, + "learning_rate": 2.1158522506114696e-05, + "loss": 0.866, + "step": 14120 + }, + { + "epoch": 0.81, + "grad_norm": 0.251953125, + "learning_rate": 2.1096957593741772e-05, + "loss": 1.0118, + "step": 14125 + }, + { + "epoch": 0.81, + "grad_norm": 0.265625, + "learning_rate": 2.1035471815606178e-05, + "loss": 0.916, + "step": 14130 + }, + { + "epoch": 0.81, + "grad_norm": 0.2578125, + "learning_rate": 2.0974065233373864e-05, + "loss": 0.9609, + "step": 14135 + }, + { + "epoch": 0.81, + "grad_norm": 0.275390625, + "learning_rate": 2.0912737908631296e-05, + "loss": 0.9456, + "step": 14140 + }, + { + "epoch": 0.81, + "grad_norm": 0.30078125, + "learning_rate": 2.0851489902885556e-05, + "loss": 0.9138, + "step": 14145 + }, + { + "epoch": 0.81, + "grad_norm": 0.2578125, + "learning_rate": 2.079032127756405e-05, + "loss": 0.9634, + "step": 14150 + }, + { + "epoch": 0.81, + "grad_norm": 0.244140625, + "learning_rate": 2.0729232094014693e-05, + "loss": 0.9542, + "step": 14155 + }, + { + "epoch": 0.81, + "grad_norm": 0.26953125, + "learning_rate": 2.066822241350561e-05, + "loss": 0.9578, + "step": 14160 + }, + { + "epoch": 0.81, + "grad_norm": 0.3359375, + "learning_rate": 2.06072922972253e-05, + "loss": 0.9623, + "step": 14165 + }, + { + "epoch": 0.81, + "grad_norm": 0.2490234375, + "learning_rate": 2.0546441806282367e-05, + "loss": 0.9994, + "step": 14170 + }, + { + "epoch": 0.81, + "grad_norm": 0.255859375, + "learning_rate": 2.048567100170562e-05, + "loss": 0.9908, + "step": 14175 + }, + { + "epoch": 0.81, + "grad_norm": 0.263671875, + "learning_rate": 2.0424979944443946e-05, + "loss": 0.9765, + "step": 14180 + }, + { + "epoch": 0.81, + "grad_norm": 0.271484375, + "learning_rate": 2.0364368695366233e-05, + "loss": 1.0086, + "step": 14185 + }, + { + "epoch": 0.81, + "grad_norm": 0.2470703125, + "learning_rate": 2.0303837315261277e-05, + "loss": 0.9459, + "step": 14190 + }, + { + "epoch": 0.81, + "grad_norm": 0.29296875, + "learning_rate": 2.0243385864837884e-05, + "loss": 0.9684, + "step": 14195 + }, + { + "epoch": 0.81, + "grad_norm": 0.25, + "learning_rate": 2.018301440472461e-05, + "loss": 1.0211, + "step": 14200 + }, + { + "epoch": 0.82, + "grad_norm": 0.267578125, + "learning_rate": 2.012272299546978e-05, + "loss": 0.9033, + "step": 14205 + }, + { + "epoch": 0.82, + "grad_norm": 0.2470703125, + "learning_rate": 2.0062511697541485e-05, + "loss": 0.8911, + "step": 14210 + }, + { + "epoch": 0.82, + "grad_norm": 0.259765625, + "learning_rate": 2.0002380571327482e-05, + "loss": 0.9106, + "step": 14215 + }, + { + "epoch": 0.82, + "grad_norm": 0.263671875, + "learning_rate": 1.994232967713505e-05, + "loss": 0.8815, + "step": 14220 + }, + { + "epoch": 0.82, + "grad_norm": 0.267578125, + "learning_rate": 1.988235907519107e-05, + "loss": 0.9802, + "step": 14225 + }, + { + "epoch": 0.82, + "grad_norm": 0.2578125, + "learning_rate": 1.9822468825641816e-05, + "loss": 0.9102, + "step": 14230 + }, + { + "epoch": 0.82, + "grad_norm": 0.25, + "learning_rate": 1.9762658988553086e-05, + "loss": 0.9642, + "step": 14235 + }, + { + "epoch": 0.82, + "grad_norm": 0.26171875, + "learning_rate": 1.9702929623909903e-05, + "loss": 0.9312, + "step": 14240 + }, + { + "epoch": 0.82, + "grad_norm": 0.26171875, + "learning_rate": 1.9643280791616702e-05, + "loss": 0.9633, + "step": 14245 + }, + { + "epoch": 0.82, + "grad_norm": 0.259765625, + "learning_rate": 1.9583712551497036e-05, + "loss": 0.9299, + "step": 14250 + }, + { + "epoch": 0.82, + "grad_norm": 0.2578125, + "learning_rate": 1.9524224963293726e-05, + "loss": 0.9632, + "step": 14255 + }, + { + "epoch": 0.82, + "grad_norm": 0.2470703125, + "learning_rate": 1.946481808666868e-05, + "loss": 0.9104, + "step": 14260 + }, + { + "epoch": 0.82, + "grad_norm": 0.2373046875, + "learning_rate": 1.9405491981202794e-05, + "loss": 0.9729, + "step": 14265 + }, + { + "epoch": 0.82, + "grad_norm": 0.26171875, + "learning_rate": 1.9346246706396066e-05, + "loss": 1.0376, + "step": 14270 + }, + { + "epoch": 0.82, + "grad_norm": 0.2578125, + "learning_rate": 1.9287082321667295e-05, + "loss": 0.892, + "step": 14275 + }, + { + "epoch": 0.82, + "grad_norm": 0.2490234375, + "learning_rate": 1.9227998886354305e-05, + "loss": 0.9092, + "step": 14280 + }, + { + "epoch": 0.82, + "grad_norm": 0.259765625, + "learning_rate": 1.9168996459713594e-05, + "loss": 0.9831, + "step": 14285 + }, + { + "epoch": 0.82, + "grad_norm": 0.2578125, + "learning_rate": 1.9110075100920466e-05, + "loss": 0.9946, + "step": 14290 + }, + { + "epoch": 0.82, + "grad_norm": 0.2490234375, + "learning_rate": 1.9051234869068978e-05, + "loss": 0.9851, + "step": 14295 + }, + { + "epoch": 0.82, + "grad_norm": 0.2578125, + "learning_rate": 1.8992475823171717e-05, + "loss": 0.9481, + "step": 14300 + }, + { + "epoch": 0.82, + "grad_norm": 0.259765625, + "learning_rate": 1.8933798022159943e-05, + "loss": 0.9748, + "step": 14305 + }, + { + "epoch": 0.82, + "grad_norm": 0.26171875, + "learning_rate": 1.8875201524883347e-05, + "loss": 0.9562, + "step": 14310 + }, + { + "epoch": 0.82, + "grad_norm": 0.26171875, + "learning_rate": 1.8816686390110172e-05, + "loss": 0.9724, + "step": 14315 + }, + { + "epoch": 0.82, + "grad_norm": 0.2373046875, + "learning_rate": 1.8758252676526945e-05, + "loss": 0.8649, + "step": 14320 + }, + { + "epoch": 0.82, + "grad_norm": 0.29296875, + "learning_rate": 1.869990044273867e-05, + "loss": 0.9095, + "step": 14325 + }, + { + "epoch": 0.82, + "grad_norm": 0.2412109375, + "learning_rate": 1.8641629747268486e-05, + "loss": 0.8604, + "step": 14330 + }, + { + "epoch": 0.82, + "grad_norm": 0.259765625, + "learning_rate": 1.858344064855787e-05, + "loss": 0.9063, + "step": 14335 + }, + { + "epoch": 0.82, + "grad_norm": 0.25, + "learning_rate": 1.8525333204966443e-05, + "loss": 0.9234, + "step": 14340 + }, + { + "epoch": 0.82, + "grad_norm": 0.265625, + "learning_rate": 1.8467307474771855e-05, + "loss": 0.9857, + "step": 14345 + }, + { + "epoch": 0.82, + "grad_norm": 0.25390625, + "learning_rate": 1.8409363516169907e-05, + "loss": 0.9724, + "step": 14350 + }, + { + "epoch": 0.82, + "grad_norm": 0.2451171875, + "learning_rate": 1.8351501387274307e-05, + "loss": 0.9585, + "step": 14355 + }, + { + "epoch": 0.82, + "grad_norm": 0.265625, + "learning_rate": 1.829372114611675e-05, + "loss": 0.8978, + "step": 14360 + }, + { + "epoch": 0.82, + "grad_norm": 0.26171875, + "learning_rate": 1.823602285064675e-05, + "loss": 0.9764, + "step": 14365 + }, + { + "epoch": 0.82, + "grad_norm": 0.2451171875, + "learning_rate": 1.817840655873171e-05, + "loss": 0.8856, + "step": 14370 + }, + { + "epoch": 0.82, + "grad_norm": 0.2392578125, + "learning_rate": 1.8120872328156713e-05, + "loss": 0.9027, + "step": 14375 + }, + { + "epoch": 0.83, + "grad_norm": 0.251953125, + "learning_rate": 1.8063420216624548e-05, + "loss": 0.845, + "step": 14380 + }, + { + "epoch": 0.83, + "grad_norm": 0.259765625, + "learning_rate": 1.8006050281755725e-05, + "loss": 0.8853, + "step": 14385 + }, + { + "epoch": 0.83, + "grad_norm": 0.259765625, + "learning_rate": 1.794876258108823e-05, + "loss": 0.8895, + "step": 14390 + }, + { + "epoch": 0.83, + "grad_norm": 0.26953125, + "learning_rate": 1.7891557172077666e-05, + "loss": 1.0076, + "step": 14395 + }, + { + "epoch": 0.83, + "grad_norm": 0.25, + "learning_rate": 1.783443411209701e-05, + "loss": 0.9455, + "step": 14400 + }, + { + "epoch": 0.83, + "grad_norm": 0.2490234375, + "learning_rate": 1.7777393458436753e-05, + "loss": 0.915, + "step": 14405 + }, + { + "epoch": 0.83, + "grad_norm": 0.28125, + "learning_rate": 1.7720435268304625e-05, + "loss": 1.0042, + "step": 14410 + }, + { + "epoch": 0.83, + "grad_norm": 0.267578125, + "learning_rate": 1.766355959882575e-05, + "loss": 0.9392, + "step": 14415 + }, + { + "epoch": 0.83, + "grad_norm": 0.291015625, + "learning_rate": 1.7606766507042473e-05, + "loss": 0.9593, + "step": 14420 + }, + { + "epoch": 0.83, + "grad_norm": 0.248046875, + "learning_rate": 1.755005604991423e-05, + "loss": 0.9684, + "step": 14425 + }, + { + "epoch": 0.83, + "grad_norm": 0.2490234375, + "learning_rate": 1.7493428284317716e-05, + "loss": 0.885, + "step": 14430 + }, + { + "epoch": 0.83, + "grad_norm": 0.25390625, + "learning_rate": 1.7436883267046567e-05, + "loss": 0.9329, + "step": 14435 + }, + { + "epoch": 0.83, + "grad_norm": 0.2734375, + "learning_rate": 1.7380421054811524e-05, + "loss": 1.0126, + "step": 14440 + }, + { + "epoch": 0.83, + "grad_norm": 0.2412109375, + "learning_rate": 1.732404170424019e-05, + "loss": 0.8792, + "step": 14445 + }, + { + "epoch": 0.83, + "grad_norm": 0.25390625, + "learning_rate": 1.7267745271877177e-05, + "loss": 0.9785, + "step": 14450 + }, + { + "epoch": 0.83, + "grad_norm": 0.255859375, + "learning_rate": 1.7211531814183825e-05, + "loss": 0.8623, + "step": 14455 + }, + { + "epoch": 0.83, + "grad_norm": 0.255859375, + "learning_rate": 1.7155401387538327e-05, + "loss": 0.9199, + "step": 14460 + }, + { + "epoch": 0.83, + "grad_norm": 0.25, + "learning_rate": 1.709935404823555e-05, + "loss": 0.8532, + "step": 14465 + }, + { + "epoch": 0.83, + "grad_norm": 0.2578125, + "learning_rate": 1.70433898524871e-05, + "loss": 0.9295, + "step": 14470 + }, + { + "epoch": 0.83, + "grad_norm": 0.279296875, + "learning_rate": 1.698750885642114e-05, + "loss": 0.9423, + "step": 14475 + }, + { + "epoch": 0.83, + "grad_norm": 0.251953125, + "learning_rate": 1.693171111608237e-05, + "loss": 0.9283, + "step": 14480 + }, + { + "epoch": 0.83, + "grad_norm": 0.25390625, + "learning_rate": 1.6875996687432073e-05, + "loss": 0.9966, + "step": 14485 + }, + { + "epoch": 0.83, + "grad_norm": 0.263671875, + "learning_rate": 1.6820365626347878e-05, + "loss": 0.9173, + "step": 14490 + }, + { + "epoch": 0.83, + "grad_norm": 0.267578125, + "learning_rate": 1.676481798862387e-05, + "loss": 0.8821, + "step": 14495 + }, + { + "epoch": 0.83, + "grad_norm": 0.283203125, + "learning_rate": 1.6709353829970496e-05, + "loss": 0.9018, + "step": 14500 + }, + { + "epoch": 0.83, + "grad_norm": 0.26171875, + "learning_rate": 1.665397320601436e-05, + "loss": 0.8973, + "step": 14505 + }, + { + "epoch": 0.83, + "grad_norm": 0.25, + "learning_rate": 1.659867617229841e-05, + "loss": 0.8835, + "step": 14510 + }, + { + "epoch": 0.83, + "grad_norm": 0.255859375, + "learning_rate": 1.654346278428166e-05, + "loss": 0.9423, + "step": 14515 + }, + { + "epoch": 0.83, + "grad_norm": 0.25, + "learning_rate": 1.6488333097339335e-05, + "loss": 0.9255, + "step": 14520 + }, + { + "epoch": 0.83, + "grad_norm": 0.2578125, + "learning_rate": 1.64332871667626e-05, + "loss": 0.8827, + "step": 14525 + }, + { + "epoch": 0.83, + "grad_norm": 0.25390625, + "learning_rate": 1.6378325047758723e-05, + "loss": 0.9508, + "step": 14530 + }, + { + "epoch": 0.83, + "grad_norm": 0.2470703125, + "learning_rate": 1.6323446795450826e-05, + "loss": 0.9225, + "step": 14535 + }, + { + "epoch": 0.83, + "grad_norm": 0.275390625, + "learning_rate": 1.6268652464877988e-05, + "loss": 1.091, + "step": 14540 + }, + { + "epoch": 0.83, + "grad_norm": 0.25390625, + "learning_rate": 1.6213942110995105e-05, + "loss": 1.0095, + "step": 14545 + }, + { + "epoch": 0.83, + "grad_norm": 0.2890625, + "learning_rate": 1.6159315788672825e-05, + "loss": 1.0021, + "step": 14550 + }, + { + "epoch": 0.84, + "grad_norm": 0.2392578125, + "learning_rate": 1.6104773552697517e-05, + "loss": 0.9367, + "step": 14555 + }, + { + "epoch": 0.84, + "grad_norm": 0.236328125, + "learning_rate": 1.6050315457771257e-05, + "loss": 0.954, + "step": 14560 + }, + { + "epoch": 0.84, + "grad_norm": 0.24609375, + "learning_rate": 1.5995941558511695e-05, + "loss": 0.9226, + "step": 14565 + }, + { + "epoch": 0.84, + "grad_norm": 0.251953125, + "learning_rate": 1.5941651909452028e-05, + "loss": 0.9882, + "step": 14570 + }, + { + "epoch": 0.84, + "grad_norm": 0.26953125, + "learning_rate": 1.5887446565041007e-05, + "loss": 0.9663, + "step": 14575 + }, + { + "epoch": 0.84, + "grad_norm": 0.26953125, + "learning_rate": 1.583332557964282e-05, + "loss": 0.9891, + "step": 14580 + }, + { + "epoch": 0.84, + "grad_norm": 0.267578125, + "learning_rate": 1.5779289007537e-05, + "loss": 0.9168, + "step": 14585 + }, + { + "epoch": 0.84, + "grad_norm": 0.25, + "learning_rate": 1.5725336902918486e-05, + "loss": 0.952, + "step": 14590 + }, + { + "epoch": 0.84, + "grad_norm": 0.25, + "learning_rate": 1.5671469319897425e-05, + "loss": 0.9711, + "step": 14595 + }, + { + "epoch": 0.84, + "grad_norm": 0.275390625, + "learning_rate": 1.561768631249929e-05, + "loss": 0.9587, + "step": 14600 + }, + { + "epoch": 0.84, + "grad_norm": 0.37890625, + "learning_rate": 1.5563987934664624e-05, + "loss": 0.9205, + "step": 14605 + }, + { + "epoch": 0.84, + "grad_norm": 0.28125, + "learning_rate": 1.5510374240249205e-05, + "loss": 0.9584, + "step": 14610 + }, + { + "epoch": 0.84, + "grad_norm": 0.248046875, + "learning_rate": 1.5456845283023758e-05, + "loss": 0.9561, + "step": 14615 + }, + { + "epoch": 0.84, + "grad_norm": 0.2578125, + "learning_rate": 1.540340111667413e-05, + "loss": 0.8794, + "step": 14620 + }, + { + "epoch": 0.84, + "grad_norm": 0.2578125, + "learning_rate": 1.5350041794801097e-05, + "loss": 0.9399, + "step": 14625 + }, + { + "epoch": 0.84, + "grad_norm": 0.25390625, + "learning_rate": 1.5296767370920273e-05, + "loss": 0.9739, + "step": 14630 + }, + { + "epoch": 0.84, + "grad_norm": 0.251953125, + "learning_rate": 1.5243577898462246e-05, + "loss": 0.9206, + "step": 14635 + }, + { + "epoch": 0.84, + "grad_norm": 0.283203125, + "learning_rate": 1.5190473430772289e-05, + "loss": 1.0323, + "step": 14640 + }, + { + "epoch": 0.84, + "grad_norm": 0.26953125, + "learning_rate": 1.5137454021110508e-05, + "loss": 0.9263, + "step": 14645 + }, + { + "epoch": 0.84, + "grad_norm": 0.251953125, + "learning_rate": 1.5084519722651658e-05, + "loss": 0.8936, + "step": 14650 + }, + { + "epoch": 0.84, + "grad_norm": 0.25390625, + "learning_rate": 1.5031670588485103e-05, + "loss": 0.9072, + "step": 14655 + }, + { + "epoch": 0.84, + "grad_norm": 0.27734375, + "learning_rate": 1.497890667161489e-05, + "loss": 0.9722, + "step": 14660 + }, + { + "epoch": 0.84, + "grad_norm": 0.2353515625, + "learning_rate": 1.4926228024959487e-05, + "loss": 0.8629, + "step": 14665 + }, + { + "epoch": 0.84, + "grad_norm": 0.2470703125, + "learning_rate": 1.4873634701351946e-05, + "loss": 0.8868, + "step": 14670 + }, + { + "epoch": 0.84, + "grad_norm": 0.271484375, + "learning_rate": 1.4821126753539638e-05, + "loss": 0.98, + "step": 14675 + }, + { + "epoch": 0.84, + "grad_norm": 0.27734375, + "learning_rate": 1.4768704234184428e-05, + "loss": 0.9752, + "step": 14680 + }, + { + "epoch": 0.84, + "grad_norm": 0.267578125, + "learning_rate": 1.471636719586238e-05, + "loss": 0.9944, + "step": 14685 + }, + { + "epoch": 0.84, + "grad_norm": 0.251953125, + "learning_rate": 1.466411569106393e-05, + "loss": 0.9097, + "step": 14690 + }, + { + "epoch": 0.84, + "grad_norm": 0.26171875, + "learning_rate": 1.4611949772193657e-05, + "loss": 0.8881, + "step": 14695 + }, + { + "epoch": 0.84, + "grad_norm": 0.267578125, + "learning_rate": 1.4559869491570332e-05, + "loss": 0.8736, + "step": 14700 + }, + { + "epoch": 0.84, + "grad_norm": 0.255859375, + "learning_rate": 1.4507874901426877e-05, + "loss": 0.9889, + "step": 14705 + }, + { + "epoch": 0.84, + "grad_norm": 0.26953125, + "learning_rate": 1.4455966053910187e-05, + "loss": 0.9122, + "step": 14710 + }, + { + "epoch": 0.84, + "grad_norm": 0.259765625, + "learning_rate": 1.4404143001081238e-05, + "loss": 0.9335, + "step": 14715 + }, + { + "epoch": 0.84, + "grad_norm": 0.267578125, + "learning_rate": 1.4352405794914892e-05, + "loss": 0.934, + "step": 14720 + }, + { + "epoch": 0.84, + "grad_norm": 0.267578125, + "learning_rate": 1.4300754487300006e-05, + "loss": 0.9494, + "step": 14725 + }, + { + "epoch": 0.85, + "grad_norm": 0.26171875, + "learning_rate": 1.4249189130039175e-05, + "loss": 0.9182, + "step": 14730 + }, + { + "epoch": 0.85, + "grad_norm": 0.24609375, + "learning_rate": 1.419770977484891e-05, + "loss": 0.9858, + "step": 14735 + }, + { + "epoch": 0.85, + "grad_norm": 0.25390625, + "learning_rate": 1.4146316473359366e-05, + "loss": 0.9127, + "step": 14740 + }, + { + "epoch": 0.85, + "grad_norm": 0.248046875, + "learning_rate": 1.4095009277114412e-05, + "loss": 0.9651, + "step": 14745 + }, + { + "epoch": 0.85, + "grad_norm": 0.255859375, + "learning_rate": 1.4043788237571632e-05, + "loss": 0.9079, + "step": 14750 + }, + { + "epoch": 0.85, + "grad_norm": 0.265625, + "learning_rate": 1.3992653406102097e-05, + "loss": 0.9679, + "step": 14755 + }, + { + "epoch": 0.85, + "grad_norm": 0.248046875, + "learning_rate": 1.3941604833990528e-05, + "loss": 0.9932, + "step": 14760 + }, + { + "epoch": 0.85, + "grad_norm": 0.28125, + "learning_rate": 1.389064257243502e-05, + "loss": 0.8945, + "step": 14765 + }, + { + "epoch": 0.85, + "grad_norm": 0.2578125, + "learning_rate": 1.3839766672547206e-05, + "loss": 0.9169, + "step": 14770 + }, + { + "epoch": 0.85, + "grad_norm": 0.2578125, + "learning_rate": 1.3788977185352003e-05, + "loss": 0.9185, + "step": 14775 + }, + { + "epoch": 0.85, + "grad_norm": 0.25390625, + "learning_rate": 1.3738274161787768e-05, + "loss": 0.9105, + "step": 14780 + }, + { + "epoch": 0.85, + "grad_norm": 0.265625, + "learning_rate": 1.3687657652706076e-05, + "loss": 0.9342, + "step": 14785 + }, + { + "epoch": 0.85, + "grad_norm": 0.2373046875, + "learning_rate": 1.3637127708871734e-05, + "loss": 0.9612, + "step": 14790 + }, + { + "epoch": 0.85, + "grad_norm": 0.251953125, + "learning_rate": 1.3586684380962778e-05, + "loss": 0.9476, + "step": 14795 + }, + { + "epoch": 0.85, + "grad_norm": 0.244140625, + "learning_rate": 1.3536327719570286e-05, + "loss": 0.895, + "step": 14800 + }, + { + "epoch": 0.85, + "grad_norm": 0.25, + "learning_rate": 1.3486057775198535e-05, + "loss": 0.9524, + "step": 14805 + }, + { + "epoch": 0.85, + "grad_norm": 0.31640625, + "learning_rate": 1.3435874598264709e-05, + "loss": 1.0229, + "step": 14810 + }, + { + "epoch": 0.85, + "grad_norm": 0.251953125, + "learning_rate": 1.3385778239099067e-05, + "loss": 0.9131, + "step": 14815 + }, + { + "epoch": 0.85, + "grad_norm": 0.251953125, + "learning_rate": 1.3335768747944722e-05, + "loss": 0.8996, + "step": 14820 + }, + { + "epoch": 0.85, + "grad_norm": 0.25390625, + "learning_rate": 1.3285846174957728e-05, + "loss": 0.9554, + "step": 14825 + }, + { + "epoch": 0.85, + "grad_norm": 0.251953125, + "learning_rate": 1.3236010570206914e-05, + "loss": 0.9014, + "step": 14830 + }, + { + "epoch": 0.85, + "grad_norm": 0.2421875, + "learning_rate": 1.3186261983673942e-05, + "loss": 0.9185, + "step": 14835 + }, + { + "epoch": 0.85, + "grad_norm": 0.240234375, + "learning_rate": 1.3136600465253147e-05, + "loss": 0.9184, + "step": 14840 + }, + { + "epoch": 0.85, + "grad_norm": 0.248046875, + "learning_rate": 1.308702606475154e-05, + "loss": 0.9709, + "step": 14845 + }, + { + "epoch": 0.85, + "grad_norm": 0.2734375, + "learning_rate": 1.3037538831888819e-05, + "loss": 0.9465, + "step": 14850 + }, + { + "epoch": 0.85, + "grad_norm": 0.248046875, + "learning_rate": 1.298813881629718e-05, + "loss": 0.8841, + "step": 14855 + }, + { + "epoch": 0.85, + "grad_norm": 0.265625, + "learning_rate": 1.2938826067521404e-05, + "loss": 0.8585, + "step": 14860 + }, + { + "epoch": 0.85, + "grad_norm": 0.26171875, + "learning_rate": 1.2889600635018762e-05, + "loss": 1.0142, + "step": 14865 + }, + { + "epoch": 0.85, + "grad_norm": 0.30078125, + "learning_rate": 1.2840462568158874e-05, + "loss": 0.8854, + "step": 14870 + }, + { + "epoch": 0.85, + "grad_norm": 0.244140625, + "learning_rate": 1.2791411916223827e-05, + "loss": 0.9188, + "step": 14875 + }, + { + "epoch": 0.85, + "grad_norm": 0.28125, + "learning_rate": 1.2742448728407963e-05, + "loss": 0.9166, + "step": 14880 + }, + { + "epoch": 0.85, + "grad_norm": 0.25390625, + "learning_rate": 1.2693573053817976e-05, + "loss": 0.8888, + "step": 14885 + }, + { + "epoch": 0.85, + "grad_norm": 0.25390625, + "learning_rate": 1.2644784941472699e-05, + "loss": 0.9281, + "step": 14890 + }, + { + "epoch": 0.85, + "grad_norm": 0.2431640625, + "learning_rate": 1.2596084440303258e-05, + "loss": 0.9147, + "step": 14895 + }, + { + "epoch": 0.85, + "grad_norm": 0.2578125, + "learning_rate": 1.2547471599152804e-05, + "loss": 0.9626, + "step": 14900 + }, + { + "epoch": 0.86, + "grad_norm": 0.263671875, + "learning_rate": 1.2498946466776639e-05, + "loss": 0.9859, + "step": 14905 + }, + { + "epoch": 0.86, + "grad_norm": 0.291015625, + "learning_rate": 1.24505090918421e-05, + "loss": 0.9697, + "step": 14910 + }, + { + "epoch": 0.86, + "grad_norm": 0.26953125, + "learning_rate": 1.240215952292847e-05, + "loss": 0.9919, + "step": 14915 + }, + { + "epoch": 0.86, + "grad_norm": 0.25390625, + "learning_rate": 1.235389780852696e-05, + "loss": 0.9465, + "step": 14920 + }, + { + "epoch": 0.86, + "grad_norm": 0.255859375, + "learning_rate": 1.2305723997040752e-05, + "loss": 0.9491, + "step": 14925 + }, + { + "epoch": 0.86, + "grad_norm": 0.251953125, + "learning_rate": 1.2257638136784777e-05, + "loss": 0.931, + "step": 14930 + }, + { + "epoch": 0.86, + "grad_norm": 0.2578125, + "learning_rate": 1.2209640275985779e-05, + "loss": 0.9354, + "step": 14935 + }, + { + "epoch": 0.86, + "grad_norm": 0.259765625, + "learning_rate": 1.2161730462782283e-05, + "loss": 0.9203, + "step": 14940 + }, + { + "epoch": 0.86, + "grad_norm": 0.259765625, + "learning_rate": 1.211390874522449e-05, + "loss": 1.0151, + "step": 14945 + }, + { + "epoch": 0.86, + "grad_norm": 0.259765625, + "learning_rate": 1.2066175171274219e-05, + "loss": 0.947, + "step": 14950 + }, + { + "epoch": 0.86, + "grad_norm": 0.2294921875, + "learning_rate": 1.2018529788804932e-05, + "loss": 0.8716, + "step": 14955 + }, + { + "epoch": 0.86, + "grad_norm": 0.255859375, + "learning_rate": 1.1970972645601587e-05, + "loss": 0.9291, + "step": 14960 + }, + { + "epoch": 0.86, + "grad_norm": 0.251953125, + "learning_rate": 1.1923503789360712e-05, + "loss": 0.952, + "step": 14965 + }, + { + "epoch": 0.86, + "grad_norm": 0.2451171875, + "learning_rate": 1.1876123267690209e-05, + "loss": 0.9042, + "step": 14970 + }, + { + "epoch": 0.86, + "grad_norm": 0.251953125, + "learning_rate": 1.1828831128109475e-05, + "loss": 0.9913, + "step": 14975 + }, + { + "epoch": 0.86, + "grad_norm": 0.26953125, + "learning_rate": 1.1781627418049179e-05, + "loss": 0.9471, + "step": 14980 + }, + { + "epoch": 0.86, + "grad_norm": 0.2578125, + "learning_rate": 1.1734512184851377e-05, + "loss": 0.9306, + "step": 14985 + }, + { + "epoch": 0.86, + "grad_norm": 0.228515625, + "learning_rate": 1.1687485475769343e-05, + "loss": 0.9082, + "step": 14990 + }, + { + "epoch": 0.86, + "grad_norm": 0.267578125, + "learning_rate": 1.1640547337967577e-05, + "loss": 0.9856, + "step": 14995 + }, + { + "epoch": 0.86, + "grad_norm": 0.2451171875, + "learning_rate": 1.1593697818521765e-05, + "loss": 0.8916, + "step": 15000 + }, + { + "epoch": 0.86, + "grad_norm": 0.265625, + "learning_rate": 1.1546936964418664e-05, + "loss": 0.9843, + "step": 15005 + }, + { + "epoch": 0.86, + "grad_norm": 0.2470703125, + "learning_rate": 1.1500264822556194e-05, + "loss": 0.8997, + "step": 15010 + }, + { + "epoch": 0.86, + "grad_norm": 0.2392578125, + "learning_rate": 1.1453681439743224e-05, + "loss": 0.9233, + "step": 15015 + }, + { + "epoch": 0.86, + "grad_norm": 0.2353515625, + "learning_rate": 1.1407186862699614e-05, + "loss": 0.8995, + "step": 15020 + }, + { + "epoch": 0.86, + "grad_norm": 0.279296875, + "learning_rate": 1.1360781138056209e-05, + "loss": 0.9615, + "step": 15025 + }, + { + "epoch": 0.86, + "grad_norm": 0.2470703125, + "learning_rate": 1.1314464312354678e-05, + "loss": 1.0071, + "step": 15030 + }, + { + "epoch": 0.86, + "grad_norm": 0.26171875, + "learning_rate": 1.1268236432047596e-05, + "loss": 0.9332, + "step": 15035 + }, + { + "epoch": 0.86, + "grad_norm": 0.287109375, + "learning_rate": 1.1222097543498244e-05, + "loss": 0.9835, + "step": 15040 + }, + { + "epoch": 0.86, + "grad_norm": 0.26171875, + "learning_rate": 1.1176047692980773e-05, + "loss": 0.929, + "step": 15045 + }, + { + "epoch": 0.86, + "grad_norm": 0.296875, + "learning_rate": 1.1130086926679894e-05, + "loss": 0.9613, + "step": 15050 + }, + { + "epoch": 0.86, + "grad_norm": 0.251953125, + "learning_rate": 1.1084215290691092e-05, + "loss": 0.9264, + "step": 15055 + }, + { + "epoch": 0.86, + "grad_norm": 0.2490234375, + "learning_rate": 1.1038432831020384e-05, + "loss": 1.0485, + "step": 15060 + }, + { + "epoch": 0.86, + "grad_norm": 0.275390625, + "learning_rate": 1.099273959358439e-05, + "loss": 0.9286, + "step": 15065 + }, + { + "epoch": 0.86, + "grad_norm": 0.2470703125, + "learning_rate": 1.0947135624210247e-05, + "loss": 0.85, + "step": 15070 + }, + { + "epoch": 0.86, + "grad_norm": 0.255859375, + "learning_rate": 1.0901620968635517e-05, + "loss": 0.9062, + "step": 15075 + }, + { + "epoch": 0.87, + "grad_norm": 0.255859375, + "learning_rate": 1.0856195672508262e-05, + "loss": 0.9725, + "step": 15080 + }, + { + "epoch": 0.87, + "grad_norm": 0.302734375, + "learning_rate": 1.0810859781386828e-05, + "loss": 0.947, + "step": 15085 + }, + { + "epoch": 0.87, + "grad_norm": 0.28515625, + "learning_rate": 1.0765613340739989e-05, + "loss": 0.9843, + "step": 15090 + }, + { + "epoch": 0.87, + "grad_norm": 0.369140625, + "learning_rate": 1.0720456395946732e-05, + "loss": 1.0915, + "step": 15095 + }, + { + "epoch": 0.87, + "grad_norm": 0.2470703125, + "learning_rate": 1.0675388992296353e-05, + "loss": 0.8977, + "step": 15100 + }, + { + "epoch": 0.87, + "grad_norm": 0.251953125, + "learning_rate": 1.0630411174988275e-05, + "loss": 0.9742, + "step": 15105 + }, + { + "epoch": 0.87, + "grad_norm": 0.26953125, + "learning_rate": 1.0585522989132102e-05, + "loss": 0.9579, + "step": 15110 + }, + { + "epoch": 0.87, + "grad_norm": 0.28125, + "learning_rate": 1.0540724479747587e-05, + "loss": 0.9284, + "step": 15115 + }, + { + "epoch": 0.87, + "grad_norm": 0.26171875, + "learning_rate": 1.0496015691764461e-05, + "loss": 0.9307, + "step": 15120 + }, + { + "epoch": 0.87, + "grad_norm": 0.271484375, + "learning_rate": 1.045139667002254e-05, + "loss": 0.9269, + "step": 15125 + }, + { + "epoch": 0.87, + "grad_norm": 0.2578125, + "learning_rate": 1.0406867459271564e-05, + "loss": 0.8834, + "step": 15130 + }, + { + "epoch": 0.87, + "grad_norm": 0.28515625, + "learning_rate": 1.036242810417124e-05, + "loss": 0.8787, + "step": 15135 + }, + { + "epoch": 0.87, + "grad_norm": 0.248046875, + "learning_rate": 1.0318078649291119e-05, + "loss": 0.9375, + "step": 15140 + }, + { + "epoch": 0.87, + "grad_norm": 0.2490234375, + "learning_rate": 1.0273819139110608e-05, + "loss": 0.9264, + "step": 15145 + }, + { + "epoch": 0.87, + "grad_norm": 0.2353515625, + "learning_rate": 1.0229649618018933e-05, + "loss": 0.9261, + "step": 15150 + }, + { + "epoch": 0.87, + "grad_norm": 0.26171875, + "learning_rate": 1.0185570130314991e-05, + "loss": 0.9527, + "step": 15155 + }, + { + "epoch": 0.87, + "grad_norm": 0.25390625, + "learning_rate": 1.0141580720207466e-05, + "loss": 0.9477, + "step": 15160 + }, + { + "epoch": 0.87, + "grad_norm": 0.267578125, + "learning_rate": 1.0097681431814621e-05, + "loss": 1.0199, + "step": 15165 + }, + { + "epoch": 0.87, + "grad_norm": 0.302734375, + "learning_rate": 1.0053872309164414e-05, + "loss": 1.0186, + "step": 15170 + }, + { + "epoch": 0.87, + "grad_norm": 0.26171875, + "learning_rate": 1.00101533961943e-05, + "loss": 0.9518, + "step": 15175 + }, + { + "epoch": 0.87, + "grad_norm": 0.259765625, + "learning_rate": 9.966524736751337e-06, + "loss": 0.9136, + "step": 15180 + }, + { + "epoch": 0.87, + "grad_norm": 0.259765625, + "learning_rate": 9.922986374591969e-06, + "loss": 0.923, + "step": 15185 + }, + { + "epoch": 0.87, + "grad_norm": 0.25, + "learning_rate": 9.879538353382178e-06, + "loss": 0.9558, + "step": 15190 + }, + { + "epoch": 0.87, + "grad_norm": 0.26171875, + "learning_rate": 9.836180716697251e-06, + "loss": 0.9489, + "step": 15195 + }, + { + "epoch": 0.87, + "grad_norm": 0.244140625, + "learning_rate": 9.792913508021906e-06, + "loss": 0.9487, + "step": 15200 + }, + { + "epoch": 0.87, + "grad_norm": 0.26171875, + "learning_rate": 9.749736770750106e-06, + "loss": 0.953, + "step": 15205 + }, + { + "epoch": 0.87, + "grad_norm": 0.267578125, + "learning_rate": 9.706650548185091e-06, + "loss": 0.938, + "step": 15210 + }, + { + "epoch": 0.87, + "grad_norm": 0.259765625, + "learning_rate": 9.663654883539364e-06, + "loss": 0.9389, + "step": 15215 + }, + { + "epoch": 0.87, + "grad_norm": 0.259765625, + "learning_rate": 9.620749819934539e-06, + "loss": 0.9893, + "step": 15220 + }, + { + "epoch": 0.87, + "grad_norm": 0.287109375, + "learning_rate": 9.577935400401406e-06, + "loss": 0.9918, + "step": 15225 + }, + { + "epoch": 0.87, + "grad_norm": 0.2578125, + "learning_rate": 9.535211667879861e-06, + "loss": 0.9645, + "step": 15230 + }, + { + "epoch": 0.87, + "grad_norm": 0.25, + "learning_rate": 9.492578665218788e-06, + "loss": 0.9379, + "step": 15235 + }, + { + "epoch": 0.87, + "grad_norm": 0.259765625, + "learning_rate": 9.450036435176136e-06, + "loss": 0.9619, + "step": 15240 + }, + { + "epoch": 0.87, + "grad_norm": 0.275390625, + "learning_rate": 9.407585020418763e-06, + "loss": 0.9583, + "step": 15245 + }, + { + "epoch": 0.88, + "grad_norm": 0.2578125, + "learning_rate": 9.365224463522492e-06, + "loss": 0.8578, + "step": 15250 + }, + { + "epoch": 0.88, + "grad_norm": 0.244140625, + "learning_rate": 9.322954806971985e-06, + "loss": 0.9276, + "step": 15255 + }, + { + "epoch": 0.88, + "grad_norm": 0.25, + "learning_rate": 9.280776093160782e-06, + "loss": 0.9916, + "step": 15260 + }, + { + "epoch": 0.88, + "grad_norm": 0.27734375, + "learning_rate": 9.238688364391135e-06, + "loss": 0.9697, + "step": 15265 + }, + { + "epoch": 0.88, + "grad_norm": 0.25390625, + "learning_rate": 9.196691662874135e-06, + "loss": 0.9859, + "step": 15270 + }, + { + "epoch": 0.88, + "grad_norm": 0.248046875, + "learning_rate": 9.154786030729545e-06, + "loss": 0.9831, + "step": 15275 + }, + { + "epoch": 0.88, + "grad_norm": 0.265625, + "learning_rate": 9.112971509985757e-06, + "loss": 0.9049, + "step": 15280 + }, + { + "epoch": 0.88, + "grad_norm": 0.28515625, + "learning_rate": 9.071248142579825e-06, + "loss": 0.9565, + "step": 15285 + }, + { + "epoch": 0.88, + "grad_norm": 0.2578125, + "learning_rate": 9.029615970357375e-06, + "loss": 0.9601, + "step": 15290 + }, + { + "epoch": 0.88, + "grad_norm": 0.255859375, + "learning_rate": 8.988075035072562e-06, + "loss": 0.8574, + "step": 15295 + }, + { + "epoch": 0.88, + "grad_norm": 0.267578125, + "learning_rate": 8.946625378388029e-06, + "loss": 0.9204, + "step": 15300 + }, + { + "epoch": 0.88, + "grad_norm": 0.2734375, + "learning_rate": 8.905267041874887e-06, + "loss": 0.941, + "step": 15305 + }, + { + "epoch": 0.88, + "grad_norm": 0.26171875, + "learning_rate": 8.864000067012702e-06, + "loss": 0.9745, + "step": 15310 + }, + { + "epoch": 0.88, + "grad_norm": 0.25390625, + "learning_rate": 8.822824495189297e-06, + "loss": 0.9844, + "step": 15315 + }, + { + "epoch": 0.88, + "grad_norm": 0.255859375, + "learning_rate": 8.781740367700941e-06, + "loss": 0.8501, + "step": 15320 + }, + { + "epoch": 0.88, + "grad_norm": 0.244140625, + "learning_rate": 8.740747725752118e-06, + "loss": 0.9523, + "step": 15325 + }, + { + "epoch": 0.88, + "grad_norm": 0.2578125, + "learning_rate": 8.69984661045562e-06, + "loss": 0.952, + "step": 15330 + }, + { + "epoch": 0.88, + "grad_norm": 0.2578125, + "learning_rate": 8.659037062832342e-06, + "loss": 0.9723, + "step": 15335 + }, + { + "epoch": 0.88, + "grad_norm": 0.25, + "learning_rate": 8.61831912381148e-06, + "loss": 0.9169, + "step": 15340 + }, + { + "epoch": 0.88, + "grad_norm": 0.25, + "learning_rate": 8.577692834230223e-06, + "loss": 0.9249, + "step": 15345 + }, + { + "epoch": 0.88, + "grad_norm": 0.29296875, + "learning_rate": 8.53715823483393e-06, + "loss": 0.9725, + "step": 15350 + }, + { + "epoch": 0.88, + "grad_norm": 0.2578125, + "learning_rate": 8.496715366275976e-06, + "loss": 0.9398, + "step": 15355 + }, + { + "epoch": 0.88, + "grad_norm": 0.251953125, + "learning_rate": 8.456364269117711e-06, + "loss": 0.8724, + "step": 15360 + }, + { + "epoch": 0.88, + "grad_norm": 0.265625, + "learning_rate": 8.416104983828499e-06, + "loss": 0.9182, + "step": 15365 + }, + { + "epoch": 0.88, + "grad_norm": 0.265625, + "learning_rate": 8.375937550785539e-06, + "loss": 0.9904, + "step": 15370 + }, + { + "epoch": 0.88, + "grad_norm": 0.263671875, + "learning_rate": 8.335862010274009e-06, + "loss": 1.0124, + "step": 15375 + }, + { + "epoch": 0.88, + "grad_norm": 0.251953125, + "learning_rate": 8.295878402486856e-06, + "loss": 0.9373, + "step": 15380 + }, + { + "epoch": 0.88, + "grad_norm": 0.2451171875, + "learning_rate": 8.255986767524826e-06, + "loss": 0.892, + "step": 15385 + }, + { + "epoch": 0.88, + "grad_norm": 0.26171875, + "learning_rate": 8.21618714539647e-06, + "loss": 0.9043, + "step": 15390 + }, + { + "epoch": 0.88, + "grad_norm": 0.251953125, + "learning_rate": 8.176479576018003e-06, + "loss": 0.933, + "step": 15395 + }, + { + "epoch": 0.88, + "grad_norm": 0.267578125, + "learning_rate": 8.136864099213359e-06, + "loss": 0.9752, + "step": 15400 + }, + { + "epoch": 0.88, + "grad_norm": 0.27734375, + "learning_rate": 8.097340754714078e-06, + "loss": 0.9664, + "step": 15405 + }, + { + "epoch": 0.88, + "grad_norm": 0.2490234375, + "learning_rate": 8.05790958215934e-06, + "loss": 0.8931, + "step": 15410 + }, + { + "epoch": 0.88, + "grad_norm": 0.25390625, + "learning_rate": 8.018570621095822e-06, + "loss": 0.9852, + "step": 15415 + }, + { + "epoch": 0.88, + "grad_norm": 0.267578125, + "learning_rate": 7.979323910977787e-06, + "loss": 0.95, + "step": 15420 + }, + { + "epoch": 0.89, + "grad_norm": 0.265625, + "learning_rate": 7.940169491166904e-06, + "loss": 0.9871, + "step": 15425 + }, + { + "epoch": 0.89, + "grad_norm": 0.26171875, + "learning_rate": 7.90110740093234e-06, + "loss": 0.8466, + "step": 15430 + }, + { + "epoch": 0.89, + "grad_norm": 0.251953125, + "learning_rate": 7.862137679450653e-06, + "loss": 0.8754, + "step": 15435 + }, + { + "epoch": 0.89, + "grad_norm": 0.27734375, + "learning_rate": 7.823260365805717e-06, + "loss": 0.9045, + "step": 15440 + }, + { + "epoch": 0.89, + "grad_norm": 0.251953125, + "learning_rate": 7.784475498988808e-06, + "loss": 0.9642, + "step": 15445 + }, + { + "epoch": 0.89, + "grad_norm": 0.255859375, + "learning_rate": 7.745783117898397e-06, + "loss": 0.9163, + "step": 15450 + }, + { + "epoch": 0.89, + "grad_norm": 0.25390625, + "learning_rate": 7.707183261340255e-06, + "loss": 1.0132, + "step": 15455 + }, + { + "epoch": 0.89, + "grad_norm": 0.25390625, + "learning_rate": 7.668675968027328e-06, + "loss": 0.9522, + "step": 15460 + }, + { + "epoch": 0.89, + "grad_norm": 0.25390625, + "learning_rate": 7.630261276579765e-06, + "loss": 0.9135, + "step": 15465 + }, + { + "epoch": 0.89, + "grad_norm": 0.244140625, + "learning_rate": 7.5919392255248025e-06, + "loss": 0.983, + "step": 15470 + }, + { + "epoch": 0.89, + "grad_norm": 0.271484375, + "learning_rate": 7.553709853296764e-06, + "loss": 0.9665, + "step": 15475 + }, + { + "epoch": 0.89, + "grad_norm": 0.25390625, + "learning_rate": 7.515573198237069e-06, + "loss": 0.9158, + "step": 15480 + }, + { + "epoch": 0.89, + "grad_norm": 0.2294921875, + "learning_rate": 7.477529298594077e-06, + "loss": 0.9633, + "step": 15485 + }, + { + "epoch": 0.89, + "grad_norm": 0.26171875, + "learning_rate": 7.439578192523211e-06, + "loss": 0.9665, + "step": 15490 + }, + { + "epoch": 0.89, + "grad_norm": 0.251953125, + "learning_rate": 7.4017199180867246e-06, + "loss": 1.0238, + "step": 15495 + }, + { + "epoch": 0.89, + "grad_norm": 0.2421875, + "learning_rate": 7.363954513253879e-06, + "loss": 0.955, + "step": 15500 + }, + { + "epoch": 0.89, + "grad_norm": 0.248046875, + "learning_rate": 7.3262820159006765e-06, + "loss": 0.857, + "step": 15505 + }, + { + "epoch": 0.89, + "grad_norm": 0.244140625, + "learning_rate": 7.288702463810026e-06, + "loss": 0.958, + "step": 15510 + }, + { + "epoch": 0.89, + "grad_norm": 0.28125, + "learning_rate": 7.2512158946716145e-06, + "loss": 0.9374, + "step": 15515 + }, + { + "epoch": 0.89, + "grad_norm": 0.2490234375, + "learning_rate": 7.213822346081822e-06, + "loss": 0.9358, + "step": 15520 + }, + { + "epoch": 0.89, + "grad_norm": 0.265625, + "learning_rate": 7.176521855543772e-06, + "loss": 0.962, + "step": 15525 + }, + { + "epoch": 0.89, + "grad_norm": 0.2734375, + "learning_rate": 7.13931446046725e-06, + "loss": 0.9796, + "step": 15530 + }, + { + "epoch": 0.89, + "grad_norm": 0.2578125, + "learning_rate": 7.1022001981686845e-06, + "loss": 0.9309, + "step": 15535 + }, + { + "epoch": 0.89, + "grad_norm": 0.28125, + "learning_rate": 7.065179105871056e-06, + "loss": 0.9524, + "step": 15540 + }, + { + "epoch": 0.89, + "grad_norm": 0.255859375, + "learning_rate": 7.028251220703974e-06, + "loss": 0.9123, + "step": 15545 + }, + { + "epoch": 0.89, + "grad_norm": 0.26953125, + "learning_rate": 6.99141657970348e-06, + "loss": 0.9028, + "step": 15550 + }, + { + "epoch": 0.89, + "grad_norm": 0.2470703125, + "learning_rate": 6.954675219812201e-06, + "loss": 0.9712, + "step": 15555 + }, + { + "epoch": 0.89, + "grad_norm": 0.26171875, + "learning_rate": 6.918027177879094e-06, + "loss": 0.9409, + "step": 15560 + }, + { + "epoch": 0.89, + "grad_norm": 0.275390625, + "learning_rate": 6.881472490659635e-06, + "loss": 0.9266, + "step": 15565 + }, + { + "epoch": 0.89, + "grad_norm": 0.255859375, + "learning_rate": 6.845011194815598e-06, + "loss": 0.9021, + "step": 15570 + }, + { + "epoch": 0.89, + "grad_norm": 0.263671875, + "learning_rate": 6.808643326915087e-06, + "loss": 0.9928, + "step": 15575 + }, + { + "epoch": 0.89, + "grad_norm": 0.267578125, + "learning_rate": 6.772368923432593e-06, + "loss": 0.9648, + "step": 15580 + }, + { + "epoch": 0.89, + "grad_norm": 0.271484375, + "learning_rate": 6.736188020748746e-06, + "loss": 0.9538, + "step": 15585 + }, + { + "epoch": 0.89, + "grad_norm": 0.263671875, + "learning_rate": 6.700100655150487e-06, + "loss": 0.9885, + "step": 15590 + }, + { + "epoch": 0.89, + "grad_norm": 0.255859375, + "learning_rate": 6.6641068628309545e-06, + "loss": 0.9485, + "step": 15595 + }, + { + "epoch": 0.9, + "grad_norm": 0.287109375, + "learning_rate": 6.628206679889349e-06, + "loss": 0.9249, + "step": 15600 + }, + { + "epoch": 0.9, + "grad_norm": 0.28125, + "learning_rate": 6.5924001423311014e-06, + "loss": 0.9412, + "step": 15605 + }, + { + "epoch": 0.9, + "grad_norm": 0.2451171875, + "learning_rate": 6.55668728606762e-06, + "loss": 0.8892, + "step": 15610 + }, + { + "epoch": 0.9, + "grad_norm": 0.2392578125, + "learning_rate": 6.521068146916432e-06, + "loss": 0.9258, + "step": 15615 + }, + { + "epoch": 0.9, + "grad_norm": 0.24609375, + "learning_rate": 6.485542760601027e-06, + "loss": 0.9151, + "step": 15620 + }, + { + "epoch": 0.9, + "grad_norm": 0.27734375, + "learning_rate": 6.450111162750905e-06, + "loss": 0.9967, + "step": 15625 + }, + { + "epoch": 0.9, + "grad_norm": 0.2412109375, + "learning_rate": 6.414773388901452e-06, + "loss": 0.872, + "step": 15630 + }, + { + "epoch": 0.9, + "grad_norm": 0.26171875, + "learning_rate": 6.379529474494006e-06, + "loss": 0.8878, + "step": 15635 + }, + { + "epoch": 0.9, + "grad_norm": 0.251953125, + "learning_rate": 6.344379454875771e-06, + "loss": 0.9422, + "step": 15640 + }, + { + "epoch": 0.9, + "grad_norm": 0.2470703125, + "learning_rate": 6.309323365299724e-06, + "loss": 0.9509, + "step": 15645 + }, + { + "epoch": 0.9, + "grad_norm": 0.2734375, + "learning_rate": 6.274361240924676e-06, + "loss": 0.9587, + "step": 15650 + }, + { + "epoch": 0.9, + "grad_norm": 0.2373046875, + "learning_rate": 6.239493116815231e-06, + "loss": 0.9111, + "step": 15655 + }, + { + "epoch": 0.9, + "grad_norm": 0.26171875, + "learning_rate": 6.204719027941641e-06, + "loss": 0.8894, + "step": 15660 + }, + { + "epoch": 0.9, + "grad_norm": 0.248046875, + "learning_rate": 6.170039009179895e-06, + "loss": 0.8962, + "step": 15665 + }, + { + "epoch": 0.9, + "grad_norm": 0.2890625, + "learning_rate": 6.135453095311627e-06, + "loss": 0.923, + "step": 15670 + }, + { + "epoch": 0.9, + "grad_norm": 0.2470703125, + "learning_rate": 6.100961321024112e-06, + "loss": 0.9691, + "step": 15675 + }, + { + "epoch": 0.9, + "grad_norm": 0.25, + "learning_rate": 6.066563720910168e-06, + "loss": 1.0003, + "step": 15680 + }, + { + "epoch": 0.9, + "grad_norm": 0.2578125, + "learning_rate": 6.032260329468198e-06, + "loss": 0.9405, + "step": 15685 + }, + { + "epoch": 0.9, + "grad_norm": 0.26171875, + "learning_rate": 5.998051181102082e-06, + "loss": 0.8734, + "step": 15690 + }, + { + "epoch": 0.9, + "grad_norm": 0.255859375, + "learning_rate": 5.963936310121243e-06, + "loss": 0.9619, + "step": 15695 + }, + { + "epoch": 0.9, + "grad_norm": 0.248046875, + "learning_rate": 5.929915750740478e-06, + "loss": 0.9116, + "step": 15700 + }, + { + "epoch": 0.9, + "grad_norm": 0.2490234375, + "learning_rate": 5.89598953708006e-06, + "loss": 0.9071, + "step": 15705 + }, + { + "epoch": 0.9, + "grad_norm": 0.26171875, + "learning_rate": 5.862157703165583e-06, + "loss": 0.9501, + "step": 15710 + }, + { + "epoch": 0.9, + "grad_norm": 0.251953125, + "learning_rate": 5.828420282928016e-06, + "loss": 0.9906, + "step": 15715 + }, + { + "epoch": 0.9, + "grad_norm": 0.263671875, + "learning_rate": 5.794777310203658e-06, + "loss": 0.9415, + "step": 15720 + }, + { + "epoch": 0.9, + "grad_norm": 0.25, + "learning_rate": 5.761228818734032e-06, + "loss": 0.9114, + "step": 15725 + }, + { + "epoch": 0.9, + "grad_norm": 0.2470703125, + "learning_rate": 5.727774842165956e-06, + "loss": 0.9124, + "step": 15730 + }, + { + "epoch": 0.9, + "grad_norm": 0.267578125, + "learning_rate": 5.694415414051402e-06, + "loss": 0.8599, + "step": 15735 + }, + { + "epoch": 0.9, + "grad_norm": 0.263671875, + "learning_rate": 5.6611505678475726e-06, + "loss": 0.9767, + "step": 15740 + }, + { + "epoch": 0.9, + "grad_norm": 0.259765625, + "learning_rate": 5.627980336916772e-06, + "loss": 0.9614, + "step": 15745 + }, + { + "epoch": 0.9, + "grad_norm": 0.263671875, + "learning_rate": 5.594904754526398e-06, + "loss": 0.9834, + "step": 15750 + }, + { + "epoch": 0.9, + "grad_norm": 0.2578125, + "learning_rate": 5.561923853848994e-06, + "loss": 0.962, + "step": 15755 + }, + { + "epoch": 0.9, + "grad_norm": 0.25, + "learning_rate": 5.529037667962067e-06, + "loss": 0.9628, + "step": 15760 + }, + { + "epoch": 0.9, + "grad_norm": 0.322265625, + "learning_rate": 5.496246229848179e-06, + "loss": 0.9037, + "step": 15765 + }, + { + "epoch": 0.9, + "grad_norm": 0.255859375, + "learning_rate": 5.463549572394833e-06, + "loss": 0.8848, + "step": 15770 + }, + { + "epoch": 0.91, + "grad_norm": 0.244140625, + "learning_rate": 5.4309477283945194e-06, + "loss": 0.89, + "step": 15775 + }, + { + "epoch": 0.91, + "grad_norm": 0.2578125, + "learning_rate": 5.3984407305445736e-06, + "loss": 0.9684, + "step": 15780 + }, + { + "epoch": 0.91, + "grad_norm": 0.263671875, + "learning_rate": 5.366028611447282e-06, + "loss": 1.0014, + "step": 15785 + }, + { + "epoch": 0.91, + "grad_norm": 0.244140625, + "learning_rate": 5.333711403609698e-06, + "loss": 0.9274, + "step": 15790 + }, + { + "epoch": 0.91, + "grad_norm": 0.2421875, + "learning_rate": 5.301489139443738e-06, + "loss": 0.9214, + "step": 15795 + }, + { + "epoch": 0.91, + "grad_norm": 0.271484375, + "learning_rate": 5.269361851266097e-06, + "loss": 1.0203, + "step": 15800 + }, + { + "epoch": 0.91, + "grad_norm": 0.27734375, + "learning_rate": 5.237329571298166e-06, + "loss": 0.9583, + "step": 15805 + }, + { + "epoch": 0.91, + "grad_norm": 0.28515625, + "learning_rate": 5.205392331666126e-06, + "loss": 0.9585, + "step": 15810 + }, + { + "epoch": 0.91, + "grad_norm": 0.26171875, + "learning_rate": 5.173550164400753e-06, + "loss": 0.9791, + "step": 15815 + }, + { + "epoch": 0.91, + "grad_norm": 0.287109375, + "learning_rate": 5.141803101437559e-06, + "loss": 0.9917, + "step": 15820 + }, + { + "epoch": 0.91, + "grad_norm": 0.234375, + "learning_rate": 5.110151174616584e-06, + "loss": 0.851, + "step": 15825 + }, + { + "epoch": 0.91, + "grad_norm": 0.271484375, + "learning_rate": 5.078594415682536e-06, + "loss": 0.9701, + "step": 15830 + }, + { + "epoch": 0.91, + "grad_norm": 0.287109375, + "learning_rate": 5.04713285628462e-06, + "loss": 0.9642, + "step": 15835 + }, + { + "epoch": 0.91, + "grad_norm": 0.2578125, + "learning_rate": 5.015766527976573e-06, + "loss": 0.9995, + "step": 15840 + }, + { + "epoch": 0.91, + "grad_norm": 0.263671875, + "learning_rate": 4.984495462216654e-06, + "loss": 0.9138, + "step": 15845 + }, + { + "epoch": 0.91, + "grad_norm": 0.259765625, + "learning_rate": 4.953319690367531e-06, + "loss": 1.0434, + "step": 15850 + }, + { + "epoch": 0.91, + "grad_norm": 0.283203125, + "learning_rate": 4.922239243696358e-06, + "loss": 0.9751, + "step": 15855 + }, + { + "epoch": 0.91, + "grad_norm": 0.2490234375, + "learning_rate": 4.891254153374614e-06, + "loss": 0.8599, + "step": 15860 + }, + { + "epoch": 0.91, + "grad_norm": 0.2734375, + "learning_rate": 4.860364450478206e-06, + "loss": 1.0107, + "step": 15865 + }, + { + "epoch": 0.91, + "grad_norm": 0.251953125, + "learning_rate": 4.829570165987318e-06, + "loss": 0.9392, + "step": 15870 + }, + { + "epoch": 0.91, + "grad_norm": 0.2421875, + "learning_rate": 4.798871330786492e-06, + "loss": 0.9362, + "step": 15875 + }, + { + "epoch": 0.91, + "grad_norm": 0.248046875, + "learning_rate": 4.7682679756645195e-06, + "loss": 0.9688, + "step": 15880 + }, + { + "epoch": 0.91, + "grad_norm": 0.2412109375, + "learning_rate": 4.737760131314406e-06, + "loss": 0.8777, + "step": 15885 + }, + { + "epoch": 0.91, + "grad_norm": 0.27734375, + "learning_rate": 4.707347828333408e-06, + "loss": 0.9369, + "step": 15890 + }, + { + "epoch": 0.91, + "grad_norm": 0.2734375, + "learning_rate": 4.677031097222906e-06, + "loss": 0.8431, + "step": 15895 + }, + { + "epoch": 0.91, + "grad_norm": 0.26171875, + "learning_rate": 4.646809968388499e-06, + "loss": 0.946, + "step": 15900 + }, + { + "epoch": 0.91, + "grad_norm": 0.26171875, + "learning_rate": 4.616684472139842e-06, + "loss": 0.9039, + "step": 15905 + }, + { + "epoch": 0.91, + "grad_norm": 0.2578125, + "learning_rate": 4.586654638690724e-06, + "loss": 0.9346, + "step": 15910 + }, + { + "epoch": 0.91, + "grad_norm": 0.271484375, + "learning_rate": 4.55672049815895e-06, + "loss": 0.9267, + "step": 15915 + }, + { + "epoch": 0.91, + "grad_norm": 0.267578125, + "learning_rate": 4.5268820805663855e-06, + "loss": 0.885, + "step": 15920 + }, + { + "epoch": 0.91, + "grad_norm": 0.26171875, + "learning_rate": 4.497139415838858e-06, + "loss": 0.9064, + "step": 15925 + }, + { + "epoch": 0.91, + "grad_norm": 0.25, + "learning_rate": 4.467492533806217e-06, + "loss": 0.9358, + "step": 15930 + }, + { + "epoch": 0.91, + "grad_norm": 0.255859375, + "learning_rate": 4.437941464202177e-06, + "loss": 0.933, + "step": 15935 + }, + { + "epoch": 0.91, + "grad_norm": 0.26171875, + "learning_rate": 4.408486236664411e-06, + "loss": 0.9991, + "step": 15940 + }, + { + "epoch": 0.91, + "grad_norm": 0.251953125, + "learning_rate": 4.379126880734463e-06, + "loss": 0.8015, + "step": 15945 + }, + { + "epoch": 0.92, + "grad_norm": 0.2412109375, + "learning_rate": 4.349863425857681e-06, + "loss": 0.8728, + "step": 15950 + }, + { + "epoch": 0.92, + "grad_norm": 0.248046875, + "learning_rate": 4.320695901383298e-06, + "loss": 0.9851, + "step": 15955 + }, + { + "epoch": 0.92, + "grad_norm": 0.2470703125, + "learning_rate": 4.291624336564304e-06, + "loss": 0.919, + "step": 15960 + }, + { + "epoch": 0.92, + "grad_norm": 0.255859375, + "learning_rate": 4.26264876055743e-06, + "loss": 0.9507, + "step": 15965 + }, + { + "epoch": 0.92, + "grad_norm": 0.248046875, + "learning_rate": 4.233769202423166e-06, + "loss": 0.9137, + "step": 15970 + }, + { + "epoch": 0.92, + "grad_norm": 0.25390625, + "learning_rate": 4.204985691125674e-06, + "loss": 0.9431, + "step": 15975 + }, + { + "epoch": 0.92, + "grad_norm": 0.24609375, + "learning_rate": 4.176298255532829e-06, + "loss": 0.9374, + "step": 15980 + }, + { + "epoch": 0.92, + "grad_norm": 0.263671875, + "learning_rate": 4.147706924416095e-06, + "loss": 0.9224, + "step": 15985 + }, + { + "epoch": 0.92, + "grad_norm": 0.26953125, + "learning_rate": 4.119211726450611e-06, + "loss": 0.9349, + "step": 15990 + }, + { + "epoch": 0.92, + "grad_norm": 0.265625, + "learning_rate": 4.0908126902150404e-06, + "loss": 0.995, + "step": 15995 + }, + { + "epoch": 0.92, + "grad_norm": 0.2392578125, + "learning_rate": 4.062509844191631e-06, + "loss": 0.9509, + "step": 16000 + }, + { + "epoch": 0.92, + "grad_norm": 0.26953125, + "learning_rate": 4.0343032167662e-06, + "loss": 0.9903, + "step": 16005 + }, + { + "epoch": 0.92, + "grad_norm": 0.29296875, + "learning_rate": 4.0061928362279846e-06, + "loss": 0.9883, + "step": 16010 + }, + { + "epoch": 0.92, + "grad_norm": 0.2470703125, + "learning_rate": 3.978178730769733e-06, + "loss": 0.9506, + "step": 16015 + }, + { + "epoch": 0.92, + "grad_norm": 0.25390625, + "learning_rate": 3.95026092848767e-06, + "loss": 0.8995, + "step": 16020 + }, + { + "epoch": 0.92, + "grad_norm": 0.25, + "learning_rate": 3.922439457381355e-06, + "loss": 0.9902, + "step": 16025 + }, + { + "epoch": 0.92, + "grad_norm": 0.259765625, + "learning_rate": 3.894714345353801e-06, + "loss": 0.8852, + "step": 16030 + }, + { + "epoch": 0.92, + "grad_norm": 0.265625, + "learning_rate": 3.867085620211352e-06, + "loss": 0.991, + "step": 16035 + }, + { + "epoch": 0.92, + "grad_norm": 0.251953125, + "learning_rate": 3.8395533096637015e-06, + "loss": 0.9524, + "step": 16040 + }, + { + "epoch": 0.92, + "grad_norm": 0.251953125, + "learning_rate": 3.8121174413238257e-06, + "loss": 0.8911, + "step": 16045 + }, + { + "epoch": 0.92, + "grad_norm": 0.255859375, + "learning_rate": 3.7847780427079814e-06, + "loss": 0.953, + "step": 16050 + }, + { + "epoch": 0.92, + "grad_norm": 0.259765625, + "learning_rate": 3.7575351412356576e-06, + "loss": 0.9258, + "step": 16055 + }, + { + "epoch": 0.92, + "grad_norm": 0.263671875, + "learning_rate": 3.73038876422962e-06, + "loss": 0.935, + "step": 16060 + }, + { + "epoch": 0.92, + "grad_norm": 0.267578125, + "learning_rate": 3.7033389389157567e-06, + "loss": 0.9546, + "step": 16065 + }, + { + "epoch": 0.92, + "grad_norm": 0.26171875, + "learning_rate": 3.6763856924231678e-06, + "loss": 0.9483, + "step": 16070 + }, + { + "epoch": 0.92, + "grad_norm": 0.251953125, + "learning_rate": 3.649529051784051e-06, + "loss": 0.9712, + "step": 16075 + }, + { + "epoch": 0.92, + "grad_norm": 0.25390625, + "learning_rate": 3.62276904393376e-06, + "loss": 0.9029, + "step": 16080 + }, + { + "epoch": 0.92, + "grad_norm": 0.25390625, + "learning_rate": 3.5961056957107273e-06, + "loss": 0.9028, + "step": 16085 + }, + { + "epoch": 0.92, + "grad_norm": 0.255859375, + "learning_rate": 3.569539033856406e-06, + "loss": 0.9829, + "step": 16090 + }, + { + "epoch": 0.92, + "grad_norm": 0.259765625, + "learning_rate": 3.5430690850153046e-06, + "loss": 1.0096, + "step": 16095 + }, + { + "epoch": 0.92, + "grad_norm": 0.251953125, + "learning_rate": 3.516695875734932e-06, + "loss": 0.8646, + "step": 16100 + }, + { + "epoch": 0.92, + "grad_norm": 0.271484375, + "learning_rate": 3.4904194324657748e-06, + "loss": 0.9332, + "step": 16105 + }, + { + "epoch": 0.92, + "grad_norm": 0.255859375, + "learning_rate": 3.4642397815612847e-06, + "loss": 0.9654, + "step": 16110 + }, + { + "epoch": 0.92, + "grad_norm": 0.27734375, + "learning_rate": 3.438156949277782e-06, + "loss": 0.9824, + "step": 16115 + }, + { + "epoch": 0.92, + "grad_norm": 0.26171875, + "learning_rate": 3.4121709617745745e-06, + "loss": 0.8956, + "step": 16120 + }, + { + "epoch": 0.93, + "grad_norm": 0.275390625, + "learning_rate": 3.386281845113748e-06, + "loss": 0.9375, + "step": 16125 + }, + { + "epoch": 0.93, + "grad_norm": 0.2470703125, + "learning_rate": 3.3604896252603104e-06, + "loss": 0.9401, + "step": 16130 + }, + { + "epoch": 0.93, + "grad_norm": 0.287109375, + "learning_rate": 3.334794328082025e-06, + "loss": 0.9871, + "step": 16135 + }, + { + "epoch": 0.93, + "grad_norm": 0.265625, + "learning_rate": 3.3091959793495107e-06, + "loss": 0.9283, + "step": 16140 + }, + { + "epoch": 0.93, + "grad_norm": 0.263671875, + "learning_rate": 3.2836946047360982e-06, + "loss": 0.9401, + "step": 16145 + }, + { + "epoch": 0.93, + "grad_norm": 0.24609375, + "learning_rate": 3.2582902298178953e-06, + "loss": 0.935, + "step": 16150 + }, + { + "epoch": 0.93, + "grad_norm": 0.263671875, + "learning_rate": 3.2329828800737096e-06, + "loss": 0.9749, + "step": 16155 + }, + { + "epoch": 0.93, + "grad_norm": 0.26953125, + "learning_rate": 3.207772580885049e-06, + "loss": 0.9388, + "step": 16160 + }, + { + "epoch": 0.93, + "grad_norm": 0.25390625, + "learning_rate": 3.1826593575360887e-06, + "loss": 0.9138, + "step": 16165 + }, + { + "epoch": 0.93, + "grad_norm": 0.25390625, + "learning_rate": 3.1576432352136144e-06, + "loss": 0.9225, + "step": 16170 + }, + { + "epoch": 0.93, + "grad_norm": 0.27734375, + "learning_rate": 3.1327242390070677e-06, + "loss": 0.9162, + "step": 16175 + }, + { + "epoch": 0.93, + "grad_norm": 0.265625, + "learning_rate": 3.1079023939084573e-06, + "loss": 0.899, + "step": 16180 + }, + { + "epoch": 0.93, + "grad_norm": 0.255859375, + "learning_rate": 3.0831777248123584e-06, + "loss": 0.9189, + "step": 16185 + }, + { + "epoch": 0.93, + "grad_norm": 0.25390625, + "learning_rate": 3.0585502565158687e-06, + "loss": 0.8877, + "step": 16190 + }, + { + "epoch": 0.93, + "grad_norm": 0.251953125, + "learning_rate": 3.0340200137186526e-06, + "loss": 0.8927, + "step": 16195 + }, + { + "epoch": 0.93, + "grad_norm": 0.3515625, + "learning_rate": 3.0095870210228083e-06, + "loss": 0.9733, + "step": 16200 + }, + { + "epoch": 0.93, + "grad_norm": 0.248046875, + "learning_rate": 2.985251302932912e-06, + "loss": 0.9776, + "step": 16205 + }, + { + "epoch": 0.93, + "grad_norm": 0.267578125, + "learning_rate": 2.9610128838560068e-06, + "loss": 0.8292, + "step": 16210 + }, + { + "epoch": 0.93, + "grad_norm": 0.26171875, + "learning_rate": 2.936871788101514e-06, + "loss": 0.9053, + "step": 16215 + }, + { + "epoch": 0.93, + "grad_norm": 0.25390625, + "learning_rate": 2.912828039881288e-06, + "loss": 0.8882, + "step": 16220 + }, + { + "epoch": 0.93, + "grad_norm": 0.255859375, + "learning_rate": 2.8888816633095063e-06, + "loss": 0.9554, + "step": 16225 + }, + { + "epoch": 0.93, + "grad_norm": 0.265625, + "learning_rate": 2.865032682402724e-06, + "loss": 0.9225, + "step": 16230 + }, + { + "epoch": 0.93, + "grad_norm": 0.255859375, + "learning_rate": 2.8412811210797975e-06, + "loss": 0.9057, + "step": 16235 + }, + { + "epoch": 0.93, + "grad_norm": 0.2734375, + "learning_rate": 2.817627003161882e-06, + "loss": 0.9084, + "step": 16240 + }, + { + "epoch": 0.93, + "grad_norm": 0.2890625, + "learning_rate": 2.7940703523724134e-06, + "loss": 0.8942, + "step": 16245 + }, + { + "epoch": 0.93, + "grad_norm": 0.255859375, + "learning_rate": 2.770611192337047e-06, + "loss": 0.9854, + "step": 16250 + }, + { + "epoch": 0.93, + "grad_norm": 0.265625, + "learning_rate": 2.747249546583708e-06, + "loss": 0.9788, + "step": 16255 + }, + { + "epoch": 0.93, + "grad_norm": 0.259765625, + "learning_rate": 2.7239854385424645e-06, + "loss": 0.848, + "step": 16260 + }, + { + "epoch": 0.93, + "grad_norm": 0.275390625, + "learning_rate": 2.70081889154562e-06, + "loss": 0.9951, + "step": 16265 + }, + { + "epoch": 0.93, + "grad_norm": 0.267578125, + "learning_rate": 2.6777499288275775e-06, + "loss": 1.004, + "step": 16270 + }, + { + "epoch": 0.93, + "grad_norm": 0.2470703125, + "learning_rate": 2.6547785735249187e-06, + "loss": 0.9352, + "step": 16275 + }, + { + "epoch": 0.93, + "grad_norm": 0.265625, + "learning_rate": 2.631904848676281e-06, + "loss": 0.9755, + "step": 16280 + }, + { + "epoch": 0.93, + "grad_norm": 0.271484375, + "learning_rate": 2.609128777222447e-06, + "loss": 1.0406, + "step": 16285 + }, + { + "epoch": 0.93, + "grad_norm": 0.255859375, + "learning_rate": 2.586450382006178e-06, + "loss": 0.89, + "step": 16290 + }, + { + "epoch": 0.93, + "grad_norm": 0.279296875, + "learning_rate": 2.563869685772358e-06, + "loss": 0.9417, + "step": 16295 + }, + { + "epoch": 0.94, + "grad_norm": 0.24609375, + "learning_rate": 2.5413867111678367e-06, + "loss": 0.978, + "step": 16300 + }, + { + "epoch": 0.94, + "grad_norm": 0.251953125, + "learning_rate": 2.5190014807414454e-06, + "loss": 0.9096, + "step": 16305 + }, + { + "epoch": 0.94, + "grad_norm": 0.24609375, + "learning_rate": 2.4967140169440464e-06, + "loss": 0.9473, + "step": 16310 + }, + { + "epoch": 0.94, + "grad_norm": 0.2470703125, + "learning_rate": 2.4745243421283706e-06, + "loss": 0.9828, + "step": 16315 + }, + { + "epoch": 0.94, + "grad_norm": 0.265625, + "learning_rate": 2.452432478549116e-06, + "loss": 0.9684, + "step": 16320 + }, + { + "epoch": 0.94, + "grad_norm": 0.263671875, + "learning_rate": 2.430438448362915e-06, + "loss": 0.9518, + "step": 16325 + }, + { + "epoch": 0.94, + "grad_norm": 0.298828125, + "learning_rate": 2.408542273628189e-06, + "loss": 0.9734, + "step": 16330 + }, + { + "epoch": 0.94, + "grad_norm": 0.251953125, + "learning_rate": 2.3867439763053166e-06, + "loss": 0.8628, + "step": 16335 + }, + { + "epoch": 0.94, + "grad_norm": 0.271484375, + "learning_rate": 2.3650435782564427e-06, + "loss": 1.007, + "step": 16340 + }, + { + "epoch": 0.94, + "grad_norm": 0.25, + "learning_rate": 2.343441101245558e-06, + "loss": 0.9013, + "step": 16345 + }, + { + "epoch": 0.94, + "grad_norm": 0.265625, + "learning_rate": 2.3219365669384206e-06, + "loss": 0.9517, + "step": 16350 + }, + { + "epoch": 0.94, + "grad_norm": 0.251953125, + "learning_rate": 2.3005299969026117e-06, + "loss": 1.0107, + "step": 16355 + }, + { + "epoch": 0.94, + "grad_norm": 0.2392578125, + "learning_rate": 2.2792214126073797e-06, + "loss": 0.9161, + "step": 16360 + }, + { + "epoch": 0.94, + "grad_norm": 0.2490234375, + "learning_rate": 2.258010835423774e-06, + "loss": 0.9082, + "step": 16365 + }, + { + "epoch": 0.94, + "grad_norm": 0.255859375, + "learning_rate": 2.2368982866245225e-06, + "loss": 0.8977, + "step": 16370 + }, + { + "epoch": 0.94, + "grad_norm": 0.25390625, + "learning_rate": 2.215883787384032e-06, + "loss": 0.8715, + "step": 16375 + }, + { + "epoch": 0.94, + "grad_norm": 0.251953125, + "learning_rate": 2.194967358778366e-06, + "loss": 0.9551, + "step": 16380 + }, + { + "epoch": 0.94, + "grad_norm": 0.26953125, + "learning_rate": 2.1741490217852545e-06, + "loss": 0.9018, + "step": 16385 + }, + { + "epoch": 0.94, + "grad_norm": 0.251953125, + "learning_rate": 2.1534287972840294e-06, + "loss": 0.9079, + "step": 16390 + }, + { + "epoch": 0.94, + "grad_norm": 0.2578125, + "learning_rate": 2.1328067060556235e-06, + "loss": 0.8878, + "step": 16395 + }, + { + "epoch": 0.94, + "grad_norm": 0.2578125, + "learning_rate": 2.1122827687825473e-06, + "loss": 0.8754, + "step": 16400 + }, + { + "epoch": 0.94, + "grad_norm": 0.271484375, + "learning_rate": 2.091857006048903e-06, + "loss": 0.9388, + "step": 16405 + }, + { + "epoch": 0.94, + "grad_norm": 0.267578125, + "learning_rate": 2.0715294383402695e-06, + "loss": 0.955, + "step": 16410 + }, + { + "epoch": 0.94, + "grad_norm": 0.28125, + "learning_rate": 2.051300086043806e-06, + "loss": 0.9801, + "step": 16415 + }, + { + "epoch": 0.94, + "grad_norm": 0.259765625, + "learning_rate": 2.031168969448116e-06, + "loss": 0.9802, + "step": 16420 + }, + { + "epoch": 0.94, + "grad_norm": 0.259765625, + "learning_rate": 2.0111361087433165e-06, + "loss": 0.938, + "step": 16425 + }, + { + "epoch": 0.94, + "grad_norm": 0.2490234375, + "learning_rate": 1.9912015240209583e-06, + "loss": 0.9356, + "step": 16430 + }, + { + "epoch": 0.94, + "grad_norm": 0.251953125, + "learning_rate": 1.9713652352740364e-06, + "loss": 0.9112, + "step": 16435 + }, + { + "epoch": 0.94, + "grad_norm": 0.251953125, + "learning_rate": 1.951627262396971e-06, + "loss": 0.9399, + "step": 16440 + }, + { + "epoch": 0.94, + "grad_norm": 0.251953125, + "learning_rate": 1.9319876251855606e-06, + "loss": 0.9381, + "step": 16445 + }, + { + "epoch": 0.94, + "grad_norm": 0.2734375, + "learning_rate": 1.9124463433370045e-06, + "loss": 0.9783, + "step": 16450 + }, + { + "epoch": 0.94, + "grad_norm": 0.263671875, + "learning_rate": 1.893003436449814e-06, + "loss": 0.9608, + "step": 16455 + }, + { + "epoch": 0.94, + "grad_norm": 0.263671875, + "learning_rate": 1.8736589240239022e-06, + "loss": 0.955, + "step": 16460 + }, + { + "epoch": 0.94, + "grad_norm": 0.251953125, + "learning_rate": 1.8544128254604277e-06, + "loss": 0.9459, + "step": 16465 + }, + { + "epoch": 0.95, + "grad_norm": 0.263671875, + "learning_rate": 1.8352651600619165e-06, + "loss": 0.9227, + "step": 16470 + }, + { + "epoch": 0.95, + "grad_norm": 0.25390625, + "learning_rate": 1.8162159470321072e-06, + "loss": 0.9127, + "step": 16475 + }, + { + "epoch": 0.95, + "grad_norm": 0.259765625, + "learning_rate": 1.7972652054760508e-06, + "loss": 0.9188, + "step": 16480 + }, + { + "epoch": 0.95, + "grad_norm": 0.2734375, + "learning_rate": 1.7784129544000106e-06, + "loss": 0.8966, + "step": 16485 + }, + { + "epoch": 0.95, + "grad_norm": 0.2734375, + "learning_rate": 1.7596592127114619e-06, + "loss": 0.9293, + "step": 16490 + }, + { + "epoch": 0.95, + "grad_norm": 0.275390625, + "learning_rate": 1.741003999219115e-06, + "loss": 0.989, + "step": 16495 + }, + { + "epoch": 0.95, + "grad_norm": 0.251953125, + "learning_rate": 1.7224473326328484e-06, + "loss": 0.9269, + "step": 16500 + }, + { + "epoch": 0.95, + "grad_norm": 0.2578125, + "learning_rate": 1.703989231563685e-06, + "loss": 0.9396, + "step": 16505 + }, + { + "epoch": 0.95, + "grad_norm": 0.255859375, + "learning_rate": 1.6856297145238177e-06, + "loss": 0.9334, + "step": 16510 + }, + { + "epoch": 0.95, + "grad_norm": 0.263671875, + "learning_rate": 1.667368799926572e-06, + "loss": 0.8955, + "step": 16515 + }, + { + "epoch": 0.95, + "grad_norm": 0.251953125, + "learning_rate": 1.6492065060863536e-06, + "loss": 0.9297, + "step": 16520 + }, + { + "epoch": 0.95, + "grad_norm": 0.2578125, + "learning_rate": 1.6311428512186699e-06, + "loss": 0.9916, + "step": 16525 + }, + { + "epoch": 0.95, + "grad_norm": 0.3125, + "learning_rate": 1.6131778534401176e-06, + "loss": 0.9735, + "step": 16530 + }, + { + "epoch": 0.95, + "grad_norm": 0.25390625, + "learning_rate": 1.5953115307683287e-06, + "loss": 0.9515, + "step": 16535 + }, + { + "epoch": 0.95, + "grad_norm": 0.265625, + "learning_rate": 1.5775439011219806e-06, + "loss": 0.9161, + "step": 16540 + }, + { + "epoch": 0.95, + "grad_norm": 0.28125, + "learning_rate": 1.5598749823207525e-06, + "loss": 0.901, + "step": 16545 + }, + { + "epoch": 0.95, + "grad_norm": 0.26953125, + "learning_rate": 1.5423047920853473e-06, + "loss": 0.9621, + "step": 16550 + }, + { + "epoch": 0.95, + "grad_norm": 0.2431640625, + "learning_rate": 1.5248333480374355e-06, + "loss": 0.9058, + "step": 16555 + }, + { + "epoch": 0.95, + "grad_norm": 0.279296875, + "learning_rate": 1.5074606676996561e-06, + "loss": 0.9201, + "step": 16560 + }, + { + "epoch": 0.95, + "grad_norm": 0.291015625, + "learning_rate": 1.4901867684955829e-06, + "loss": 0.9491, + "step": 16565 + }, + { + "epoch": 0.95, + "grad_norm": 0.259765625, + "learning_rate": 1.4730116677497351e-06, + "loss": 0.9164, + "step": 16570 + }, + { + "epoch": 0.95, + "grad_norm": 0.2412109375, + "learning_rate": 1.4559353826875344e-06, + "loss": 0.9667, + "step": 16575 + }, + { + "epoch": 0.95, + "grad_norm": 0.24609375, + "learning_rate": 1.438957930435314e-06, + "loss": 0.8973, + "step": 16580 + }, + { + "epoch": 0.95, + "grad_norm": 0.251953125, + "learning_rate": 1.422079328020265e-06, + "loss": 0.9339, + "step": 16585 + }, + { + "epoch": 0.95, + "grad_norm": 0.26953125, + "learning_rate": 1.405299592370435e-06, + "loss": 0.9648, + "step": 16590 + }, + { + "epoch": 0.95, + "grad_norm": 0.265625, + "learning_rate": 1.388618740314751e-06, + "loss": 0.9488, + "step": 16595 + }, + { + "epoch": 0.95, + "grad_norm": 0.2431640625, + "learning_rate": 1.37203678858292e-06, + "loss": 0.9638, + "step": 16600 + }, + { + "epoch": 0.95, + "grad_norm": 0.263671875, + "learning_rate": 1.355553753805483e-06, + "loss": 1.001, + "step": 16605 + }, + { + "epoch": 0.95, + "grad_norm": 0.275390625, + "learning_rate": 1.3391696525137831e-06, + "loss": 0.9786, + "step": 16610 + }, + { + "epoch": 0.95, + "grad_norm": 0.236328125, + "learning_rate": 1.3228845011399204e-06, + "loss": 0.9358, + "step": 16615 + }, + { + "epoch": 0.95, + "grad_norm": 0.2734375, + "learning_rate": 1.3066983160167746e-06, + "loss": 1.0021, + "step": 16620 + }, + { + "epoch": 0.95, + "grad_norm": 0.265625, + "learning_rate": 1.2906111133779376e-06, + "loss": 0.9945, + "step": 16625 + }, + { + "epoch": 0.95, + "grad_norm": 0.251953125, + "learning_rate": 1.27462290935777e-06, + "loss": 0.928, + "step": 16630 + }, + { + "epoch": 0.95, + "grad_norm": 0.26171875, + "learning_rate": 1.2587337199913118e-06, + "loss": 0.843, + "step": 16635 + }, + { + "epoch": 0.95, + "grad_norm": 0.255859375, + "learning_rate": 1.2429435612143158e-06, + "loss": 0.9762, + "step": 16640 + }, + { + "epoch": 0.96, + "grad_norm": 0.251953125, + "learning_rate": 1.227252448863192e-06, + "loss": 0.9254, + "step": 16645 + }, + { + "epoch": 0.96, + "grad_norm": 0.263671875, + "learning_rate": 1.211660398675052e-06, + "loss": 1.0486, + "step": 16650 + }, + { + "epoch": 0.96, + "grad_norm": 0.26171875, + "learning_rate": 1.1961674262876199e-06, + "loss": 0.9478, + "step": 16655 + }, + { + "epoch": 0.96, + "grad_norm": 0.251953125, + "learning_rate": 1.1807735472392778e-06, + "loss": 0.916, + "step": 16660 + }, + { + "epoch": 0.96, + "grad_norm": 0.271484375, + "learning_rate": 1.1654787769689868e-06, + "loss": 0.9788, + "step": 16665 + }, + { + "epoch": 0.96, + "grad_norm": 0.2734375, + "learning_rate": 1.150283130816343e-06, + "loss": 0.9357, + "step": 16670 + }, + { + "epoch": 0.96, + "grad_norm": 0.2578125, + "learning_rate": 1.1351866240215336e-06, + "loss": 0.9498, + "step": 16675 + }, + { + "epoch": 0.96, + "grad_norm": 0.275390625, + "learning_rate": 1.1201892717252692e-06, + "loss": 0.9669, + "step": 16680 + }, + { + "epoch": 0.96, + "grad_norm": 0.287109375, + "learning_rate": 1.1052910889688629e-06, + "loss": 0.9039, + "step": 16685 + }, + { + "epoch": 0.96, + "grad_norm": 0.271484375, + "learning_rate": 1.0904920906941618e-06, + "loss": 0.964, + "step": 16690 + }, + { + "epoch": 0.96, + "grad_norm": 0.267578125, + "learning_rate": 1.075792291743516e-06, + "loss": 0.8932, + "step": 16695 + }, + { + "epoch": 0.96, + "grad_norm": 0.275390625, + "learning_rate": 1.0611917068597877e-06, + "loss": 0.9485, + "step": 16700 + }, + { + "epoch": 0.96, + "grad_norm": 0.2578125, + "learning_rate": 1.0466903506863523e-06, + "loss": 0.9478, + "step": 16705 + }, + { + "epoch": 0.96, + "grad_norm": 0.267578125, + "learning_rate": 1.032288237767065e-06, + "loss": 0.9896, + "step": 16710 + }, + { + "epoch": 0.96, + "grad_norm": 0.263671875, + "learning_rate": 1.0179853825462271e-06, + "loss": 0.9185, + "step": 16715 + }, + { + "epoch": 0.96, + "grad_norm": 0.263671875, + "learning_rate": 1.0037817993686084e-06, + "loss": 0.9289, + "step": 16720 + }, + { + "epoch": 0.96, + "grad_norm": 0.265625, + "learning_rate": 9.896775024793914e-07, + "loss": 0.9296, + "step": 16725 + }, + { + "epoch": 0.96, + "grad_norm": 0.2392578125, + "learning_rate": 9.756725060242277e-07, + "loss": 0.9143, + "step": 16730 + }, + { + "epoch": 0.96, + "grad_norm": 0.26171875, + "learning_rate": 9.617668240491372e-07, + "loss": 1.0418, + "step": 16735 + }, + { + "epoch": 0.96, + "grad_norm": 0.28125, + "learning_rate": 9.479604705005529e-07, + "loss": 0.9322, + "step": 16740 + }, + { + "epoch": 0.96, + "grad_norm": 0.259765625, + "learning_rate": 9.342534592252761e-07, + "loss": 0.9627, + "step": 16745 + }, + { + "epoch": 0.96, + "grad_norm": 0.244140625, + "learning_rate": 9.206458039704768e-07, + "loss": 0.9865, + "step": 16750 + }, + { + "epoch": 0.96, + "grad_norm": 0.240234375, + "learning_rate": 9.07137518383705e-07, + "loss": 0.9431, + "step": 16755 + }, + { + "epoch": 0.96, + "grad_norm": 0.251953125, + "learning_rate": 8.9372861601279e-07, + "loss": 0.9944, + "step": 16760 + }, + { + "epoch": 0.96, + "grad_norm": 0.26953125, + "learning_rate": 8.804191103059523e-07, + "loss": 0.9314, + "step": 16765 + }, + { + "epoch": 0.96, + "grad_norm": 0.263671875, + "learning_rate": 8.672090146116917e-07, + "loss": 0.9187, + "step": 16770 + }, + { + "epoch": 0.96, + "grad_norm": 0.26953125, + "learning_rate": 8.540983421787996e-07, + "loss": 0.976, + "step": 16775 + }, + { + "epoch": 0.96, + "grad_norm": 0.2373046875, + "learning_rate": 8.410871061563797e-07, + "loss": 0.9552, + "step": 16780 + }, + { + "epoch": 0.96, + "grad_norm": 0.240234375, + "learning_rate": 8.281753195937714e-07, + "loss": 0.9202, + "step": 16785 + }, + { + "epoch": 0.96, + "grad_norm": 0.259765625, + "learning_rate": 8.153629954406161e-07, + "loss": 0.8923, + "step": 16790 + }, + { + "epoch": 0.96, + "grad_norm": 0.244140625, + "learning_rate": 8.026501465467684e-07, + "loss": 1.0348, + "step": 16795 + }, + { + "epoch": 0.96, + "grad_norm": 0.2578125, + "learning_rate": 7.900367856623403e-07, + "loss": 0.8979, + "step": 16800 + }, + { + "epoch": 0.96, + "grad_norm": 0.248046875, + "learning_rate": 7.775229254376348e-07, + "loss": 0.8703, + "step": 16805 + }, + { + "epoch": 0.96, + "grad_norm": 0.25, + "learning_rate": 7.651085784231793e-07, + "loss": 0.9397, + "step": 16810 + }, + { + "epoch": 0.96, + "grad_norm": 0.2578125, + "learning_rate": 7.52793757069703e-07, + "loss": 0.9422, + "step": 16815 + }, + { + "epoch": 0.97, + "grad_norm": 0.27734375, + "learning_rate": 7.405784737281151e-07, + "loss": 0.991, + "step": 16820 + }, + { + "epoch": 0.97, + "grad_norm": 0.25, + "learning_rate": 7.284627406494826e-07, + "loss": 0.9515, + "step": 16825 + }, + { + "epoch": 0.97, + "grad_norm": 0.25390625, + "learning_rate": 7.164465699850409e-07, + "loss": 0.9394, + "step": 16830 + }, + { + "epoch": 0.97, + "grad_norm": 0.26953125, + "learning_rate": 7.045299737861832e-07, + "loss": 0.9742, + "step": 16835 + }, + { + "epoch": 0.97, + "grad_norm": 0.263671875, + "learning_rate": 6.92712964004405e-07, + "loss": 0.949, + "step": 16840 + }, + { + "epoch": 0.97, + "grad_norm": 0.2578125, + "learning_rate": 6.809955524913369e-07, + "loss": 0.9727, + "step": 16845 + }, + { + "epoch": 0.97, + "grad_norm": 0.2578125, + "learning_rate": 6.693777509987453e-07, + "loss": 0.9082, + "step": 16850 + }, + { + "epoch": 0.97, + "grad_norm": 0.248046875, + "learning_rate": 6.578595711784541e-07, + "loss": 0.9473, + "step": 16855 + }, + { + "epoch": 0.97, + "grad_norm": 0.2421875, + "learning_rate": 6.464410245824004e-07, + "loss": 1.002, + "step": 16860 + }, + { + "epoch": 0.97, + "grad_norm": 0.251953125, + "learning_rate": 6.351221226625903e-07, + "loss": 0.99, + "step": 16865 + }, + { + "epoch": 0.97, + "grad_norm": 0.251953125, + "learning_rate": 6.239028767710986e-07, + "loss": 0.992, + "step": 16870 + }, + { + "epoch": 0.97, + "grad_norm": 0.26953125, + "learning_rate": 6.127832981600246e-07, + "loss": 0.8812, + "step": 16875 + }, + { + "epoch": 0.97, + "grad_norm": 0.25390625, + "learning_rate": 6.017633979815363e-07, + "loss": 0.9415, + "step": 16880 + }, + { + "epoch": 0.97, + "grad_norm": 0.265625, + "learning_rate": 5.908431872878372e-07, + "loss": 0.911, + "step": 16885 + }, + { + "epoch": 0.97, + "grad_norm": 0.2578125, + "learning_rate": 5.800226770311113e-07, + "loss": 0.9454, + "step": 16890 + }, + { + "epoch": 0.97, + "grad_norm": 0.25390625, + "learning_rate": 5.693018780635995e-07, + "loss": 1.0156, + "step": 16895 + }, + { + "epoch": 0.97, + "grad_norm": 0.259765625, + "learning_rate": 5.58680801137501e-07, + "loss": 0.9276, + "step": 16900 + }, + { + "epoch": 0.97, + "grad_norm": 0.25, + "learning_rate": 5.481594569050174e-07, + "loss": 0.8796, + "step": 16905 + }, + { + "epoch": 0.97, + "grad_norm": 0.2890625, + "learning_rate": 5.377378559183077e-07, + "loss": 0.9183, + "step": 16910 + }, + { + "epoch": 0.97, + "grad_norm": 0.259765625, + "learning_rate": 5.274160086295332e-07, + "loss": 0.9421, + "step": 16915 + }, + { + "epoch": 0.97, + "grad_norm": 0.2470703125, + "learning_rate": 5.171939253907687e-07, + "loss": 0.9335, + "step": 16920 + }, + { + "epoch": 0.97, + "grad_norm": 0.2578125, + "learning_rate": 5.070716164540579e-07, + "loss": 0.9423, + "step": 16925 + }, + { + "epoch": 0.97, + "grad_norm": 0.267578125, + "learning_rate": 4.970490919713577e-07, + "loss": 0.9731, + "step": 16930 + }, + { + "epoch": 0.97, + "grad_norm": 0.26171875, + "learning_rate": 4.871263619945721e-07, + "loss": 0.9762, + "step": 16935 + }, + { + "epoch": 0.97, + "grad_norm": 0.2490234375, + "learning_rate": 4.773034364754958e-07, + "loss": 0.9654, + "step": 16940 + }, + { + "epoch": 0.97, + "grad_norm": 0.26171875, + "learning_rate": 4.675803252658484e-07, + "loss": 0.9329, + "step": 16945 + }, + { + "epoch": 0.97, + "grad_norm": 0.259765625, + "learning_rate": 4.5795703811721825e-07, + "loss": 0.9249, + "step": 16950 + }, + { + "epoch": 0.97, + "grad_norm": 0.2392578125, + "learning_rate": 4.484335846810961e-07, + "loss": 0.8427, + "step": 16955 + }, + { + "epoch": 0.97, + "grad_norm": 0.251953125, + "learning_rate": 4.3900997450885274e-07, + "loss": 0.8893, + "step": 16960 + }, + { + "epoch": 0.97, + "grad_norm": 0.2373046875, + "learning_rate": 4.2968621705168354e-07, + "loss": 0.8505, + "step": 16965 + }, + { + "epoch": 0.97, + "grad_norm": 0.27734375, + "learning_rate": 4.204623216606751e-07, + "loss": 0.9157, + "step": 16970 + }, + { + "epoch": 0.97, + "grad_norm": 0.248046875, + "learning_rate": 4.113382975867608e-07, + "loss": 0.9722, + "step": 16975 + }, + { + "epoch": 0.97, + "grad_norm": 0.2578125, + "learning_rate": 4.023141539806985e-07, + "loss": 0.9496, + "step": 16980 + }, + { + "epoch": 0.97, + "grad_norm": 0.279296875, + "learning_rate": 3.9338989989307073e-07, + "loss": 0.9032, + "step": 16985 + }, + { + "epoch": 0.97, + "grad_norm": 0.259765625, + "learning_rate": 3.845655442742624e-07, + "loss": 0.9208, + "step": 16990 + }, + { + "epoch": 0.98, + "grad_norm": 0.263671875, + "learning_rate": 3.7584109597451623e-07, + "loss": 0.9329, + "step": 16995 + }, + { + "epoch": 0.98, + "grad_norm": 0.283203125, + "learning_rate": 3.672165637438218e-07, + "loss": 0.9515, + "step": 17000 + }, + { + "epoch": 0.98, + "grad_norm": 0.28125, + "learning_rate": 3.586919562319935e-07, + "loss": 0.9471, + "step": 17005 + }, + { + "epoch": 0.98, + "grad_norm": 0.26953125, + "learning_rate": 3.5026728198860324e-07, + "loss": 0.9335, + "step": 17010 + }, + { + "epoch": 0.98, + "grad_norm": 0.2470703125, + "learning_rate": 3.4194254946302573e-07, + "loss": 0.9268, + "step": 17015 + }, + { + "epoch": 0.98, + "grad_norm": 0.23828125, + "learning_rate": 3.337177670043823e-07, + "loss": 0.9364, + "step": 17020 + }, + { + "epoch": 0.98, + "grad_norm": 0.259765625, + "learning_rate": 3.255929428615523e-07, + "loss": 0.8839, + "step": 17025 + }, + { + "epoch": 0.98, + "grad_norm": 0.2734375, + "learning_rate": 3.175680851831619e-07, + "loss": 0.9537, + "step": 17030 + }, + { + "epoch": 0.98, + "grad_norm": 0.24609375, + "learning_rate": 3.0964320201759545e-07, + "loss": 0.9085, + "step": 17035 + }, + { + "epoch": 0.98, + "grad_norm": 0.2578125, + "learning_rate": 3.0181830131295053e-07, + "loss": 0.9741, + "step": 17040 + }, + { + "epoch": 0.98, + "grad_norm": 0.2490234375, + "learning_rate": 2.9409339091703844e-07, + "loss": 0.9369, + "step": 17045 + }, + { + "epoch": 0.98, + "grad_norm": 0.26171875, + "learning_rate": 2.8646847857742854e-07, + "loss": 0.9125, + "step": 17050 + }, + { + "epoch": 0.98, + "grad_norm": 0.2578125, + "learning_rate": 2.789435719413813e-07, + "loss": 0.9309, + "step": 17055 + }, + { + "epoch": 0.98, + "grad_norm": 0.283203125, + "learning_rate": 2.7151867855581546e-07, + "loss": 0.9311, + "step": 17060 + }, + { + "epoch": 0.98, + "grad_norm": 0.283203125, + "learning_rate": 2.641938058674187e-07, + "loss": 0.9312, + "step": 17065 + }, + { + "epoch": 0.98, + "grad_norm": 0.267578125, + "learning_rate": 2.569689612225035e-07, + "loss": 0.9185, + "step": 17070 + }, + { + "epoch": 0.98, + "grad_norm": 0.279296875, + "learning_rate": 2.4984415186709576e-07, + "loss": 0.9021, + "step": 17075 + }, + { + "epoch": 0.98, + "grad_norm": 0.263671875, + "learning_rate": 2.4281938494686853e-07, + "loss": 0.9878, + "step": 17080 + }, + { + "epoch": 0.98, + "grad_norm": 0.255859375, + "learning_rate": 2.3589466750718604e-07, + "loss": 0.935, + "step": 17085 + }, + { + "epoch": 0.98, + "grad_norm": 0.259765625, + "learning_rate": 2.2907000649304845e-07, + "loss": 0.9045, + "step": 17090 + }, + { + "epoch": 0.98, + "grad_norm": 0.27734375, + "learning_rate": 2.2234540874911392e-07, + "loss": 0.9137, + "step": 17095 + }, + { + "epoch": 0.98, + "grad_norm": 0.359375, + "learning_rate": 2.1572088101968758e-07, + "loss": 1.008, + "step": 17100 + }, + { + "epoch": 0.98, + "grad_norm": 0.255859375, + "learning_rate": 2.0919642994869925e-07, + "loss": 0.8877, + "step": 17105 + }, + { + "epoch": 0.98, + "grad_norm": 0.26953125, + "learning_rate": 2.0277206207972576e-07, + "loss": 0.9412, + "step": 17110 + }, + { + "epoch": 0.98, + "grad_norm": 0.29296875, + "learning_rate": 1.9644778385596864e-07, + "loss": 0.9598, + "step": 17115 + }, + { + "epoch": 0.98, + "grad_norm": 0.255859375, + "learning_rate": 1.902236016202208e-07, + "loss": 0.9411, + "step": 17120 + }, + { + "epoch": 0.98, + "grad_norm": 0.267578125, + "learning_rate": 1.8409952161489997e-07, + "loss": 0.9106, + "step": 17125 + }, + { + "epoch": 0.98, + "grad_norm": 0.265625, + "learning_rate": 1.7807554998203747e-07, + "loss": 0.8809, + "step": 17130 + }, + { + "epoch": 0.98, + "grad_norm": 0.2470703125, + "learning_rate": 1.7215169276325605e-07, + "loss": 0.9383, + "step": 17135 + }, + { + "epoch": 0.98, + "grad_norm": 0.25390625, + "learning_rate": 1.663279558997699e-07, + "loss": 0.9645, + "step": 17140 + }, + { + "epoch": 0.98, + "grad_norm": 0.265625, + "learning_rate": 1.6060434523238466e-07, + "loss": 0.8795, + "step": 17145 + }, + { + "epoch": 0.98, + "grad_norm": 0.2470703125, + "learning_rate": 1.5498086650147513e-07, + "loss": 0.9248, + "step": 17150 + }, + { + "epoch": 0.98, + "grad_norm": 0.2734375, + "learning_rate": 1.4945752534699653e-07, + "loss": 0.9705, + "step": 17155 + }, + { + "epoch": 0.98, + "grad_norm": 0.267578125, + "learning_rate": 1.4403432730847323e-07, + "loss": 0.9542, + "step": 17160 + }, + { + "epoch": 0.98, + "grad_norm": 0.25390625, + "learning_rate": 1.3871127782500993e-07, + "loss": 0.9194, + "step": 17165 + }, + { + "epoch": 0.99, + "grad_norm": 0.267578125, + "learning_rate": 1.3348838223523618e-07, + "loss": 0.8496, + "step": 17170 + }, + { + "epoch": 0.99, + "grad_norm": 0.267578125, + "learning_rate": 1.2836564577735078e-07, + "loss": 0.8582, + "step": 17175 + }, + { + "epoch": 0.99, + "grad_norm": 0.234375, + "learning_rate": 1.2334307358911056e-07, + "loss": 0.9176, + "step": 17180 + }, + { + "epoch": 0.99, + "grad_norm": 0.267578125, + "learning_rate": 1.1842067070779728e-07, + "loss": 0.9633, + "step": 17185 + }, + { + "epoch": 0.99, + "grad_norm": 0.24609375, + "learning_rate": 1.1359844207023962e-07, + "loss": 0.8306, + "step": 17190 + }, + { + "epoch": 0.99, + "grad_norm": 0.26171875, + "learning_rate": 1.0887639251280224e-07, + "loss": 0.947, + "step": 17195 + }, + { + "epoch": 0.99, + "grad_norm": 0.265625, + "learning_rate": 1.0425452677135238e-07, + "loss": 0.9984, + "step": 17200 + }, + { + "epoch": 0.99, + "grad_norm": 0.275390625, + "learning_rate": 9.973284948132656e-08, + "loss": 0.9801, + "step": 17205 + }, + { + "epoch": 0.99, + "grad_norm": 0.2412109375, + "learning_rate": 9.531136517761941e-08, + "loss": 0.9025, + "step": 17210 + }, + { + "epoch": 0.99, + "grad_norm": 0.255859375, + "learning_rate": 9.099007829469486e-08, + "loss": 0.8792, + "step": 17215 + }, + { + "epoch": 0.99, + "grad_norm": 0.267578125, + "learning_rate": 8.676899316648613e-08, + "loss": 0.9554, + "step": 17220 + }, + { + "epoch": 0.99, + "grad_norm": 0.263671875, + "learning_rate": 8.264811402646233e-08, + "loss": 1.0131, + "step": 17225 + }, + { + "epoch": 0.99, + "grad_norm": 0.263671875, + "learning_rate": 7.862744500756192e-08, + "loss": 0.9268, + "step": 17230 + }, + { + "epoch": 0.99, + "grad_norm": 0.259765625, + "learning_rate": 7.470699014223703e-08, + "loss": 0.9519, + "step": 17235 + }, + { + "epoch": 0.99, + "grad_norm": 0.265625, + "learning_rate": 7.088675336244244e-08, + "loss": 0.9131, + "step": 17240 + }, + { + "epoch": 0.99, + "grad_norm": 0.265625, + "learning_rate": 6.71667384995911e-08, + "loss": 0.9502, + "step": 17245 + }, + { + "epoch": 0.99, + "grad_norm": 0.25390625, + "learning_rate": 6.35469492846208e-08, + "loss": 0.9628, + "step": 17250 + }, + { + "epoch": 0.99, + "grad_norm": 0.255859375, + "learning_rate": 6.00273893479053e-08, + "loss": 0.9062, + "step": 17255 + }, + { + "epoch": 0.99, + "grad_norm": 0.28515625, + "learning_rate": 5.660806221932102e-08, + "loss": 0.9433, + "step": 17260 + }, + { + "epoch": 0.99, + "grad_norm": 0.26171875, + "learning_rate": 5.3288971328224747e-08, + "loss": 0.9557, + "step": 17265 + }, + { + "epoch": 0.99, + "grad_norm": 0.259765625, + "learning_rate": 5.0070120003420375e-08, + "loss": 0.9473, + "step": 17270 + }, + { + "epoch": 0.99, + "grad_norm": 0.2578125, + "learning_rate": 4.6951511473203316e-08, + "loss": 0.9675, + "step": 17275 + }, + { + "epoch": 0.99, + "grad_norm": 0.259765625, + "learning_rate": 4.3933148865316075e-08, + "loss": 0.9835, + "step": 17280 + }, + { + "epoch": 0.99, + "grad_norm": 0.25, + "learning_rate": 4.101503520695937e-08, + "loss": 0.9784, + "step": 17285 + }, + { + "epoch": 0.99, + "grad_norm": 0.26171875, + "learning_rate": 3.819717342480322e-08, + "loss": 0.9332, + "step": 17290 + }, + { + "epoch": 0.99, + "grad_norm": 0.24609375, + "learning_rate": 3.547956634495364e-08, + "loss": 0.976, + "step": 17295 + }, + { + "epoch": 0.99, + "grad_norm": 0.244140625, + "learning_rate": 3.286221669299705e-08, + "loss": 0.9327, + "step": 17300 + }, + { + "epoch": 0.99, + "grad_norm": 0.2578125, + "learning_rate": 3.0345127093955875e-08, + "loss": 0.8962, + "step": 17305 + }, + { + "epoch": 0.99, + "grad_norm": 0.2431640625, + "learning_rate": 2.7928300072277424e-08, + "loss": 0.8845, + "step": 17310 + }, + { + "epoch": 0.99, + "grad_norm": 0.28125, + "learning_rate": 2.561173805186723e-08, + "loss": 0.9619, + "step": 17315 + }, + { + "epoch": 0.99, + "grad_norm": 0.25390625, + "learning_rate": 2.339544335610011e-08, + "loss": 0.9725, + "step": 17320 + }, + { + "epoch": 0.99, + "grad_norm": 0.255859375, + "learning_rate": 2.1279418207742486e-08, + "loss": 0.9017, + "step": 17325 + }, + { + "epoch": 0.99, + "grad_norm": 0.25390625, + "learning_rate": 1.9263664729030073e-08, + "loss": 0.8888, + "step": 17330 + }, + { + "epoch": 0.99, + "grad_norm": 0.259765625, + "learning_rate": 1.7348184941623492e-08, + "loss": 0.9509, + "step": 17335 + }, + { + "epoch": 0.99, + "grad_norm": 0.263671875, + "learning_rate": 1.5532980766608252e-08, + "loss": 1.0119, + "step": 17340 + }, + { + "epoch": 1.0, + "grad_norm": 0.26171875, + "learning_rate": 1.3818054024516969e-08, + "loss": 0.9384, + "step": 17345 + }, + { + "epoch": 1.0, + "grad_norm": 0.28125, + "learning_rate": 1.2203406435284948e-08, + "loss": 0.9307, + "step": 17350 + }, + { + "epoch": 1.0, + "grad_norm": 0.244140625, + "learning_rate": 1.0689039618305696e-08, + "loss": 0.9041, + "step": 17355 + }, + { + "epoch": 1.0, + "grad_norm": 0.255859375, + "learning_rate": 9.274955092386516e-09, + "loss": 0.9935, + "step": 17360 + }, + { + "epoch": 1.0, + "grad_norm": 0.255859375, + "learning_rate": 7.961154275737403e-09, + "loss": 0.9374, + "step": 17365 + }, + { + "epoch": 1.0, + "grad_norm": 0.2353515625, + "learning_rate": 6.747638486026553e-09, + "loss": 0.91, + "step": 17370 + }, + { + "epoch": 1.0, + "grad_norm": 0.287109375, + "learning_rate": 5.634408940313751e-09, + "loss": 0.997, + "step": 17375 + }, + { + "epoch": 1.0, + "grad_norm": 0.28515625, + "learning_rate": 4.621466755094784e-09, + "loss": 0.9748, + "step": 17380 + }, + { + "epoch": 1.0, + "grad_norm": 0.255859375, + "learning_rate": 3.7088129462792277e-09, + "loss": 0.9361, + "step": 17385 + }, + { + "epoch": 1.0, + "grad_norm": 0.267578125, + "learning_rate": 2.896448429201559e-09, + "loss": 0.9747, + "step": 17390 + }, + { + "epoch": 1.0, + "grad_norm": 0.2890625, + "learning_rate": 2.1843740185878423e-09, + "loss": 0.9583, + "step": 17395 + }, + { + "epoch": 1.0, + "grad_norm": 0.251953125, + "learning_rate": 1.5725904286223446e-09, + "loss": 0.9044, + "step": 17400 + }, + { + "epoch": 1.0, + "grad_norm": 0.251953125, + "learning_rate": 1.0610982728698203e-09, + "loss": 0.9078, + "step": 17405 + }, + { + "epoch": 1.0, + "grad_norm": 0.26953125, + "learning_rate": 6.498980643199204e-10, + "loss": 0.9411, + "step": 17410 + }, + { + "epoch": 1.0, + "grad_norm": 0.263671875, + "learning_rate": 3.389902153760893e-10, + "loss": 0.9685, + "step": 17415 + }, + { + "epoch": 1.0, + "grad_norm": 0.26171875, + "learning_rate": 1.2837503786666815e-10, + "loss": 0.937, + "step": 17420 + }, + { + "epoch": 1.0, + "grad_norm": 0.283203125, + "learning_rate": 1.8052743022689556e-11, + "loss": 0.9411, + "step": 17425 + }, + { + "epoch": 1.0, + "eval_loss": 0.9471394419670105, + "eval_runtime": 7952.5701, + "eval_samples_per_second": 1.94, + "eval_steps_per_second": 0.243, + "step": 17428 + }, + { + "epoch": 1.0, + "step": 17428, + "total_flos": 1.2254141370633028e+19, + "train_loss": 0.9532137394142939, + "train_runtime": 251062.1903, + "train_samples_per_second": 0.555, + "train_steps_per_second": 0.069 + } + ], + "logging_steps": 5, + "max_steps": 17428, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1.2254141370633028e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}