diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17125 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 14.990723562152134, + "eval_steps": 500, + "global_step": 12120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012368583797155227, + "grad_norm": 3.171875, + "learning_rate": 1.6501650165016504e-07, + "loss": 2.8095, + "step": 1 + }, + { + "epoch": 0.006184291898577613, + "grad_norm": 3.1875, + "learning_rate": 8.25082508250825e-07, + "loss": 2.8204, + "step": 5 + }, + { + "epoch": 0.012368583797155226, + "grad_norm": 1.9140625, + "learning_rate": 1.65016501650165e-06, + "loss": 2.7934, + "step": 10 + }, + { + "epoch": 0.01855287569573284, + "grad_norm": 2.171875, + "learning_rate": 2.4752475247524753e-06, + "loss": 2.7932, + "step": 15 + }, + { + "epoch": 0.024737167594310452, + "grad_norm": 1.9921875, + "learning_rate": 3.3003300330033e-06, + "loss": 2.842, + "step": 20 + }, + { + "epoch": 0.030921459492888066, + "grad_norm": 1.96875, + "learning_rate": 4.125412541254126e-06, + "loss": 2.8041, + "step": 25 + }, + { + "epoch": 0.03710575139146568, + "grad_norm": 27.5, + "learning_rate": 4.950495049504951e-06, + "loss": 2.7951, + "step": 30 + }, + { + "epoch": 0.04329004329004329, + "grad_norm": 1.546875, + "learning_rate": 5.775577557755775e-06, + "loss": 2.8127, + "step": 35 + }, + { + "epoch": 0.049474335188620905, + "grad_norm": 1.8671875, + "learning_rate": 6.6006600660066e-06, + "loss": 2.8001, + "step": 40 + }, + { + "epoch": 0.055658627087198514, + "grad_norm": 1.625, + "learning_rate": 7.4257425742574256e-06, + "loss": 2.76, + "step": 45 + }, + { + "epoch": 0.06184291898577613, + "grad_norm": 1.5546875, + "learning_rate": 8.250825082508252e-06, + "loss": 2.7348, + "step": 50 + }, + { + "epoch": 0.06802721088435375, + "grad_norm": 2.765625, + "learning_rate": 9.075907590759077e-06, + "loss": 2.7171, + "step": 55 + }, + { + "epoch": 0.07421150278293136, + "grad_norm": 1.4140625, + "learning_rate": 9.900990099009901e-06, + "loss": 2.6948, + "step": 60 + }, + { + "epoch": 0.08039579468150897, + "grad_norm": 1.5546875, + "learning_rate": 1.0726072607260726e-05, + "loss": 2.6657, + "step": 65 + }, + { + "epoch": 0.08658008658008658, + "grad_norm": 1.3359375, + "learning_rate": 1.155115511551155e-05, + "loss": 2.6411, + "step": 70 + }, + { + "epoch": 0.09276437847866419, + "grad_norm": 2.8125, + "learning_rate": 1.2376237623762377e-05, + "loss": 2.5744, + "step": 75 + }, + { + "epoch": 0.09894867037724181, + "grad_norm": 3.0, + "learning_rate": 1.32013201320132e-05, + "loss": 2.5252, + "step": 80 + }, + { + "epoch": 0.10513296227581942, + "grad_norm": 1.4921875, + "learning_rate": 1.4026402640264028e-05, + "loss": 2.4702, + "step": 85 + }, + { + "epoch": 0.11131725417439703, + "grad_norm": 1.453125, + "learning_rate": 1.4851485148514851e-05, + "loss": 2.3871, + "step": 90 + }, + { + "epoch": 0.11750154607297464, + "grad_norm": 2.84375, + "learning_rate": 1.567656765676568e-05, + "loss": 2.3574, + "step": 95 + }, + { + "epoch": 0.12368583797155226, + "grad_norm": 1.6015625, + "learning_rate": 1.6501650165016504e-05, + "loss": 2.3075, + "step": 100 + }, + { + "epoch": 0.12987012987012986, + "grad_norm": 1.6484375, + "learning_rate": 1.7326732673267325e-05, + "loss": 2.2358, + "step": 105 + }, + { + "epoch": 0.1360544217687075, + "grad_norm": 2.25, + "learning_rate": 1.8151815181518153e-05, + "loss": 2.1817, + "step": 110 + }, + { + "epoch": 0.1422387136672851, + "grad_norm": 1.1328125, + "learning_rate": 1.8976897689768978e-05, + "loss": 2.1457, + "step": 115 + }, + { + "epoch": 0.14842300556586271, + "grad_norm": 0.7890625, + "learning_rate": 1.9801980198019803e-05, + "loss": 2.1015, + "step": 120 + }, + { + "epoch": 0.15460729746444032, + "grad_norm": 5.375, + "learning_rate": 2.0627062706270627e-05, + "loss": 2.0488, + "step": 125 + }, + { + "epoch": 0.16079158936301793, + "grad_norm": 1.0078125, + "learning_rate": 2.1452145214521452e-05, + "loss": 1.9967, + "step": 130 + }, + { + "epoch": 0.16697588126159554, + "grad_norm": 1.8359375, + "learning_rate": 2.227722772277228e-05, + "loss": 1.9499, + "step": 135 + }, + { + "epoch": 0.17316017316017315, + "grad_norm": 5.3125, + "learning_rate": 2.31023102310231e-05, + "loss": 1.9143, + "step": 140 + }, + { + "epoch": 0.17934446505875076, + "grad_norm": 5.625, + "learning_rate": 2.392739273927393e-05, + "loss": 1.8633, + "step": 145 + }, + { + "epoch": 0.18552875695732837, + "grad_norm": 1.9921875, + "learning_rate": 2.4752475247524754e-05, + "loss": 1.8392, + "step": 150 + }, + { + "epoch": 0.191713048855906, + "grad_norm": 1.859375, + "learning_rate": 2.557755775577558e-05, + "loss": 1.7943, + "step": 155 + }, + { + "epoch": 0.19789734075448362, + "grad_norm": 1.6015625, + "learning_rate": 2.64026402640264e-05, + "loss": 1.7733, + "step": 160 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 1.5, + "learning_rate": 2.722772277227723e-05, + "loss": 1.7403, + "step": 165 + }, + { + "epoch": 0.21026592455163884, + "grad_norm": 1.53125, + "learning_rate": 2.8052805280528056e-05, + "loss": 1.6989, + "step": 170 + }, + { + "epoch": 0.21645021645021645, + "grad_norm": 1.4453125, + "learning_rate": 2.8877887788778878e-05, + "loss": 1.6679, + "step": 175 + }, + { + "epoch": 0.22263450834879406, + "grad_norm": 1.59375, + "learning_rate": 2.9702970297029702e-05, + "loss": 1.629, + "step": 180 + }, + { + "epoch": 0.22881880024737167, + "grad_norm": 0.8671875, + "learning_rate": 3.052805280528053e-05, + "loss": 1.6264, + "step": 185 + }, + { + "epoch": 0.23500309214594928, + "grad_norm": 1.2109375, + "learning_rate": 3.135313531353136e-05, + "loss": 1.581, + "step": 190 + }, + { + "epoch": 0.24118738404452691, + "grad_norm": 0.435546875, + "learning_rate": 3.217821782178218e-05, + "loss": 1.5598, + "step": 195 + }, + { + "epoch": 0.24737167594310452, + "grad_norm": 0.8828125, + "learning_rate": 3.300330033003301e-05, + "loss": 1.5259, + "step": 200 + }, + { + "epoch": 0.2535559678416821, + "grad_norm": 0.38671875, + "learning_rate": 3.382838283828383e-05, + "loss": 1.4943, + "step": 205 + }, + { + "epoch": 0.2597402597402597, + "grad_norm": 0.59375, + "learning_rate": 3.465346534653465e-05, + "loss": 1.4984, + "step": 210 + }, + { + "epoch": 0.2659245516388373, + "grad_norm": 0.431640625, + "learning_rate": 3.5478547854785485e-05, + "loss": 1.4686, + "step": 215 + }, + { + "epoch": 0.272108843537415, + "grad_norm": 0.6484375, + "learning_rate": 3.6303630363036307e-05, + "loss": 1.4699, + "step": 220 + }, + { + "epoch": 0.2782931354359926, + "grad_norm": 1.1484375, + "learning_rate": 3.712871287128713e-05, + "loss": 1.4291, + "step": 225 + }, + { + "epoch": 0.2844774273345702, + "grad_norm": 0.4609375, + "learning_rate": 3.7953795379537956e-05, + "loss": 1.4235, + "step": 230 + }, + { + "epoch": 0.2906617192331478, + "grad_norm": 0.48828125, + "learning_rate": 3.877887788778878e-05, + "loss": 1.4041, + "step": 235 + }, + { + "epoch": 0.29684601113172543, + "grad_norm": 0.40234375, + "learning_rate": 3.9603960396039605e-05, + "loss": 1.3848, + "step": 240 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.90234375, + "learning_rate": 4.042904290429043e-05, + "loss": 1.3611, + "step": 245 + }, + { + "epoch": 0.30921459492888065, + "grad_norm": 0.314453125, + "learning_rate": 4.1254125412541255e-05, + "loss": 1.3558, + "step": 250 + }, + { + "epoch": 0.31539888682745826, + "grad_norm": 0.455078125, + "learning_rate": 4.207920792079208e-05, + "loss": 1.3451, + "step": 255 + }, + { + "epoch": 0.32158317872603587, + "grad_norm": 0.328125, + "learning_rate": 4.2904290429042904e-05, + "loss": 1.3326, + "step": 260 + }, + { + "epoch": 0.3277674706246135, + "grad_norm": 0.62890625, + "learning_rate": 4.372937293729373e-05, + "loss": 1.3084, + "step": 265 + }, + { + "epoch": 0.3339517625231911, + "grad_norm": 0.306640625, + "learning_rate": 4.455445544554456e-05, + "loss": 1.3091, + "step": 270 + }, + { + "epoch": 0.3401360544217687, + "grad_norm": 0.33984375, + "learning_rate": 4.537953795379538e-05, + "loss": 1.3078, + "step": 275 + }, + { + "epoch": 0.3463203463203463, + "grad_norm": 0.4296875, + "learning_rate": 4.62046204620462e-05, + "loss": 1.2944, + "step": 280 + }, + { + "epoch": 0.3525046382189239, + "grad_norm": 0.447265625, + "learning_rate": 4.702970297029703e-05, + "loss": 1.2689, + "step": 285 + }, + { + "epoch": 0.3586889301175015, + "grad_norm": 0.52734375, + "learning_rate": 4.785478547854786e-05, + "loss": 1.2907, + "step": 290 + }, + { + "epoch": 0.36487322201607914, + "grad_norm": 0.3515625, + "learning_rate": 4.867986798679868e-05, + "loss": 1.2734, + "step": 295 + }, + { + "epoch": 0.37105751391465674, + "grad_norm": 0.408203125, + "learning_rate": 4.950495049504951e-05, + "loss": 1.2533, + "step": 300 + }, + { + "epoch": 0.3772418058132344, + "grad_norm": 0.392578125, + "learning_rate": 5.0330033003300336e-05, + "loss": 1.2568, + "step": 305 + }, + { + "epoch": 0.383426097711812, + "grad_norm": 0.48828125, + "learning_rate": 5.115511551155116e-05, + "loss": 1.2476, + "step": 310 + }, + { + "epoch": 0.38961038961038963, + "grad_norm": 0.5078125, + "learning_rate": 5.1980198019801986e-05, + "loss": 1.2506, + "step": 315 + }, + { + "epoch": 0.39579468150896724, + "grad_norm": 0.337890625, + "learning_rate": 5.28052805280528e-05, + "loss": 1.2311, + "step": 320 + }, + { + "epoch": 0.40197897340754485, + "grad_norm": 0.357421875, + "learning_rate": 5.3630363036303635e-05, + "loss": 1.2302, + "step": 325 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 0.380859375, + "learning_rate": 5.445544554455446e-05, + "loss": 1.2182, + "step": 330 + }, + { + "epoch": 0.41434755720470007, + "grad_norm": 0.376953125, + "learning_rate": 5.528052805280528e-05, + "loss": 1.2322, + "step": 335 + }, + { + "epoch": 0.4205318491032777, + "grad_norm": 0.466796875, + "learning_rate": 5.610561056105611e-05, + "loss": 1.2181, + "step": 340 + }, + { + "epoch": 0.4267161410018553, + "grad_norm": 0.55078125, + "learning_rate": 5.693069306930693e-05, + "loss": 1.2136, + "step": 345 + }, + { + "epoch": 0.4329004329004329, + "grad_norm": 0.546875, + "learning_rate": 5.7755775577557755e-05, + "loss": 1.2089, + "step": 350 + }, + { + "epoch": 0.4390847247990105, + "grad_norm": 0.4375, + "learning_rate": 5.858085808580859e-05, + "loss": 1.197, + "step": 355 + }, + { + "epoch": 0.4452690166975881, + "grad_norm": 0.55859375, + "learning_rate": 5.9405940594059404e-05, + "loss": 1.1929, + "step": 360 + }, + { + "epoch": 0.4514533085961657, + "grad_norm": 0.55859375, + "learning_rate": 6.023102310231023e-05, + "loss": 1.1859, + "step": 365 + }, + { + "epoch": 0.45763760049474334, + "grad_norm": 0.443359375, + "learning_rate": 6.105610561056106e-05, + "loss": 1.1969, + "step": 370 + }, + { + "epoch": 0.46382189239332094, + "grad_norm": 0.46484375, + "learning_rate": 6.188118811881188e-05, + "loss": 1.1898, + "step": 375 + }, + { + "epoch": 0.47000618429189855, + "grad_norm": 0.3828125, + "learning_rate": 6.270627062706272e-05, + "loss": 1.1736, + "step": 380 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.458984375, + "learning_rate": 6.353135313531354e-05, + "loss": 1.1827, + "step": 385 + }, + { + "epoch": 0.48237476808905383, + "grad_norm": 0.4921875, + "learning_rate": 6.435643564356436e-05, + "loss": 1.1792, + "step": 390 + }, + { + "epoch": 0.48855905998763144, + "grad_norm": 0.375, + "learning_rate": 6.518151815181518e-05, + "loss": 1.1674, + "step": 395 + }, + { + "epoch": 0.49474335188620905, + "grad_norm": 0.349609375, + "learning_rate": 6.600660066006602e-05, + "loss": 1.1585, + "step": 400 + }, + { + "epoch": 0.5009276437847866, + "grad_norm": 0.333984375, + "learning_rate": 6.683168316831684e-05, + "loss": 1.1601, + "step": 405 + }, + { + "epoch": 0.5071119356833642, + "grad_norm": 0.400390625, + "learning_rate": 6.765676567656766e-05, + "loss": 1.1643, + "step": 410 + }, + { + "epoch": 0.5132962275819418, + "grad_norm": 0.404296875, + "learning_rate": 6.848184818481849e-05, + "loss": 1.1668, + "step": 415 + }, + { + "epoch": 0.5194805194805194, + "grad_norm": 0.4609375, + "learning_rate": 6.93069306930693e-05, + "loss": 1.1485, + "step": 420 + }, + { + "epoch": 0.525664811379097, + "grad_norm": 0.376953125, + "learning_rate": 7.013201320132014e-05, + "loss": 1.1561, + "step": 425 + }, + { + "epoch": 0.5318491032776747, + "grad_norm": 0.40625, + "learning_rate": 7.095709570957097e-05, + "loss": 1.1572, + "step": 430 + }, + { + "epoch": 0.5380333951762524, + "grad_norm": 0.41015625, + "learning_rate": 7.178217821782178e-05, + "loss": 1.1527, + "step": 435 + }, + { + "epoch": 0.54421768707483, + "grad_norm": 0.419921875, + "learning_rate": 7.260726072607261e-05, + "loss": 1.1559, + "step": 440 + }, + { + "epoch": 0.5504019789734076, + "grad_norm": 0.40234375, + "learning_rate": 7.343234323432343e-05, + "loss": 1.1587, + "step": 445 + }, + { + "epoch": 0.5565862708719852, + "grad_norm": 0.46484375, + "learning_rate": 7.425742574257426e-05, + "loss": 1.1394, + "step": 450 + }, + { + "epoch": 0.5627705627705628, + "grad_norm": 0.384765625, + "learning_rate": 7.508250825082509e-05, + "loss": 1.1451, + "step": 455 + }, + { + "epoch": 0.5689548546691404, + "grad_norm": 0.40625, + "learning_rate": 7.590759075907591e-05, + "loss": 1.1405, + "step": 460 + }, + { + "epoch": 0.575139146567718, + "grad_norm": 0.55859375, + "learning_rate": 7.673267326732673e-05, + "loss": 1.1397, + "step": 465 + }, + { + "epoch": 0.5813234384662956, + "grad_norm": 0.49609375, + "learning_rate": 7.755775577557755e-05, + "loss": 1.1213, + "step": 470 + }, + { + "epoch": 0.5875077303648732, + "grad_norm": 0.40234375, + "learning_rate": 7.838283828382839e-05, + "loss": 1.1313, + "step": 475 + }, + { + "epoch": 0.5936920222634509, + "grad_norm": 0.52734375, + "learning_rate": 7.920792079207921e-05, + "loss": 1.1268, + "step": 480 + }, + { + "epoch": 0.5998763141620285, + "grad_norm": 0.39453125, + "learning_rate": 8.003300330033003e-05, + "loss": 1.1137, + "step": 485 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.578125, + "learning_rate": 8.085808580858087e-05, + "loss": 1.1352, + "step": 490 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 0.4140625, + "learning_rate": 8.168316831683169e-05, + "loss": 1.1321, + "step": 495 + }, + { + "epoch": 0.6184291898577613, + "grad_norm": 0.396484375, + "learning_rate": 8.250825082508251e-05, + "loss": 1.116, + "step": 500 + }, + { + "epoch": 0.6246134817563389, + "grad_norm": 0.5703125, + "learning_rate": 8.333333333333334e-05, + "loss": 1.1233, + "step": 505 + }, + { + "epoch": 0.6307977736549165, + "grad_norm": 0.50390625, + "learning_rate": 8.415841584158417e-05, + "loss": 1.1291, + "step": 510 + }, + { + "epoch": 0.6369820655534941, + "grad_norm": 0.69140625, + "learning_rate": 8.498349834983499e-05, + "loss": 1.1307, + "step": 515 + }, + { + "epoch": 0.6431663574520717, + "grad_norm": 0.54296875, + "learning_rate": 8.580858085808581e-05, + "loss": 1.1188, + "step": 520 + }, + { + "epoch": 0.6493506493506493, + "grad_norm": 0.490234375, + "learning_rate": 8.663366336633664e-05, + "loss": 1.116, + "step": 525 + }, + { + "epoch": 0.655534941249227, + "grad_norm": 0.51953125, + "learning_rate": 8.745874587458746e-05, + "loss": 1.1128, + "step": 530 + }, + { + "epoch": 0.6617192331478046, + "grad_norm": 0.58203125, + "learning_rate": 8.828382838283829e-05, + "loss": 1.1046, + "step": 535 + }, + { + "epoch": 0.6679035250463822, + "grad_norm": 0.69921875, + "learning_rate": 8.910891089108912e-05, + "loss": 1.1184, + "step": 540 + }, + { + "epoch": 0.6740878169449598, + "grad_norm": 0.55859375, + "learning_rate": 8.993399339933993e-05, + "loss": 1.1039, + "step": 545 + }, + { + "epoch": 0.6802721088435374, + "grad_norm": 0.65234375, + "learning_rate": 9.075907590759076e-05, + "loss": 1.1097, + "step": 550 + }, + { + "epoch": 0.686456400742115, + "grad_norm": 0.486328125, + "learning_rate": 9.15841584158416e-05, + "loss": 1.1088, + "step": 555 + }, + { + "epoch": 0.6926406926406926, + "grad_norm": 0.85546875, + "learning_rate": 9.24092409240924e-05, + "loss": 1.0955, + "step": 560 + }, + { + "epoch": 0.6988249845392702, + "grad_norm": 0.57421875, + "learning_rate": 9.323432343234324e-05, + "loss": 1.1105, + "step": 565 + }, + { + "epoch": 0.7050092764378478, + "grad_norm": 0.6953125, + "learning_rate": 9.405940594059406e-05, + "loss": 1.1024, + "step": 570 + }, + { + "epoch": 0.7111935683364254, + "grad_norm": 0.5078125, + "learning_rate": 9.488448844884488e-05, + "loss": 1.0922, + "step": 575 + }, + { + "epoch": 0.717377860235003, + "grad_norm": 0.515625, + "learning_rate": 9.570957095709572e-05, + "loss": 1.0985, + "step": 580 + }, + { + "epoch": 0.7235621521335807, + "grad_norm": 0.5390625, + "learning_rate": 9.653465346534654e-05, + "loss": 1.0908, + "step": 585 + }, + { + "epoch": 0.7297464440321583, + "grad_norm": 0.60546875, + "learning_rate": 9.735973597359736e-05, + "loss": 1.0882, + "step": 590 + }, + { + "epoch": 0.7359307359307359, + "grad_norm": 0.451171875, + "learning_rate": 9.818481848184818e-05, + "loss": 1.102, + "step": 595 + }, + { + "epoch": 0.7421150278293135, + "grad_norm": 0.5625, + "learning_rate": 9.900990099009902e-05, + "loss": 1.0799, + "step": 600 + }, + { + "epoch": 0.7482993197278912, + "grad_norm": 0.57421875, + "learning_rate": 9.983498349834984e-05, + "loss": 1.0933, + "step": 605 + }, + { + "epoch": 0.7544836116264688, + "grad_norm": 0.4765625, + "learning_rate": 0.00010066006600660067, + "loss": 1.0881, + "step": 610 + }, + { + "epoch": 0.7606679035250464, + "grad_norm": 0.4375, + "learning_rate": 0.0001014851485148515, + "loss": 1.0926, + "step": 615 + }, + { + "epoch": 0.766852195423624, + "grad_norm": 0.69921875, + "learning_rate": 0.00010231023102310232, + "loss": 1.0931, + "step": 620 + }, + { + "epoch": 0.7730364873222016, + "grad_norm": 0.55859375, + "learning_rate": 0.00010313531353135315, + "loss": 1.0923, + "step": 625 + }, + { + "epoch": 0.7792207792207793, + "grad_norm": 0.62109375, + "learning_rate": 0.00010396039603960397, + "loss": 1.0854, + "step": 630 + }, + { + "epoch": 0.7854050711193569, + "grad_norm": 0.5078125, + "learning_rate": 0.00010478547854785479, + "loss": 1.0864, + "step": 635 + }, + { + "epoch": 0.7915893630179345, + "grad_norm": 0.447265625, + "learning_rate": 0.0001056105610561056, + "loss": 1.0863, + "step": 640 + }, + { + "epoch": 0.7977736549165121, + "grad_norm": 0.73046875, + "learning_rate": 0.00010643564356435645, + "loss": 1.0843, + "step": 645 + }, + { + "epoch": 0.8039579468150897, + "grad_norm": 0.71484375, + "learning_rate": 0.00010726072607260727, + "loss": 1.0822, + "step": 650 + }, + { + "epoch": 0.8101422387136673, + "grad_norm": 0.76953125, + "learning_rate": 0.00010808580858085808, + "loss": 1.0802, + "step": 655 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.55078125, + "learning_rate": 0.00010891089108910893, + "loss": 1.0822, + "step": 660 + }, + { + "epoch": 0.8225108225108225, + "grad_norm": 0.5546875, + "learning_rate": 0.00010973597359735975, + "loss": 1.076, + "step": 665 + }, + { + "epoch": 0.8286951144094001, + "grad_norm": 0.515625, + "learning_rate": 0.00011056105610561056, + "loss": 1.0688, + "step": 670 + }, + { + "epoch": 0.8348794063079777, + "grad_norm": 0.515625, + "learning_rate": 0.0001113861386138614, + "loss": 1.0839, + "step": 675 + }, + { + "epoch": 0.8410636982065554, + "grad_norm": 0.68359375, + "learning_rate": 0.00011221122112211223, + "loss": 1.0802, + "step": 680 + }, + { + "epoch": 0.847247990105133, + "grad_norm": 0.515625, + "learning_rate": 0.00011303630363036303, + "loss": 1.0699, + "step": 685 + }, + { + "epoch": 0.8534322820037106, + "grad_norm": 0.5546875, + "learning_rate": 0.00011386138613861385, + "loss": 1.0692, + "step": 690 + }, + { + "epoch": 0.8596165739022882, + "grad_norm": 0.470703125, + "learning_rate": 0.0001146864686468647, + "loss": 1.0756, + "step": 695 + }, + { + "epoch": 0.8658008658008658, + "grad_norm": 0.53125, + "learning_rate": 0.00011551155115511551, + "loss": 1.0819, + "step": 700 + }, + { + "epoch": 0.8719851576994434, + "grad_norm": 0.478515625, + "learning_rate": 0.00011633663366336633, + "loss": 1.071, + "step": 705 + }, + { + "epoch": 0.878169449598021, + "grad_norm": 0.515625, + "learning_rate": 0.00011716171617161718, + "loss": 1.069, + "step": 710 + }, + { + "epoch": 0.8843537414965986, + "grad_norm": 0.62890625, + "learning_rate": 0.00011798679867986799, + "loss": 1.0676, + "step": 715 + }, + { + "epoch": 0.8905380333951762, + "grad_norm": 0.69140625, + "learning_rate": 0.00011881188118811881, + "loss": 1.0562, + "step": 720 + }, + { + "epoch": 0.8967223252937538, + "grad_norm": 0.5859375, + "learning_rate": 0.00011963696369636966, + "loss": 1.0725, + "step": 725 + }, + { + "epoch": 0.9029066171923315, + "grad_norm": 0.60546875, + "learning_rate": 0.00012046204620462047, + "loss": 1.0705, + "step": 730 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.5703125, + "learning_rate": 0.00012128712871287129, + "loss": 1.071, + "step": 735 + }, + { + "epoch": 0.9152752009894867, + "grad_norm": 0.64453125, + "learning_rate": 0.00012211221122112212, + "loss": 1.0615, + "step": 740 + }, + { + "epoch": 0.9214594928880643, + "grad_norm": 0.56640625, + "learning_rate": 0.00012293729372937296, + "loss": 1.0725, + "step": 745 + }, + { + "epoch": 0.9276437847866419, + "grad_norm": 0.609375, + "learning_rate": 0.00012376237623762376, + "loss": 1.0583, + "step": 750 + }, + { + "epoch": 0.9338280766852195, + "grad_norm": 0.7265625, + "learning_rate": 0.0001245874587458746, + "loss": 1.059, + "step": 755 + }, + { + "epoch": 0.9400123685837971, + "grad_norm": 0.54296875, + "learning_rate": 0.00012541254125412543, + "loss": 1.0561, + "step": 760 + }, + { + "epoch": 0.9461966604823747, + "grad_norm": 0.7578125, + "learning_rate": 0.00012623762376237624, + "loss": 1.0576, + "step": 765 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.5859375, + "learning_rate": 0.00012706270627062708, + "loss": 1.056, + "step": 770 + }, + { + "epoch": 0.95856524427953, + "grad_norm": 0.58984375, + "learning_rate": 0.0001278877887788779, + "loss": 1.0583, + "step": 775 + }, + { + "epoch": 0.9647495361781077, + "grad_norm": 0.53515625, + "learning_rate": 0.00012871287128712872, + "loss": 1.0501, + "step": 780 + }, + { + "epoch": 0.9709338280766853, + "grad_norm": 0.578125, + "learning_rate": 0.00012953795379537955, + "loss": 1.057, + "step": 785 + }, + { + "epoch": 0.9771181199752629, + "grad_norm": 0.55078125, + "learning_rate": 0.00013036303630363036, + "loss": 1.0529, + "step": 790 + }, + { + "epoch": 0.9833024118738405, + "grad_norm": 0.482421875, + "learning_rate": 0.0001311881188118812, + "loss": 1.0481, + "step": 795 + }, + { + "epoch": 0.9894867037724181, + "grad_norm": 0.66796875, + "learning_rate": 0.00013201320132013203, + "loss": 1.0671, + "step": 800 + }, + { + "epoch": 0.9956709956709957, + "grad_norm": 0.546875, + "learning_rate": 0.00013283828382838284, + "loss": 1.0494, + "step": 805 + }, + { + "epoch": 0.9993815708101422, + "eval_loss": 2.4662458896636963, + "eval_runtime": 0.6569, + "eval_samples_per_second": 15.223, + "eval_steps_per_second": 1.522, + "step": 808 + }, + { + "epoch": 1.0018552875695732, + "grad_norm": 0.6953125, + "learning_rate": 0.00013366336633663367, + "loss": 1.0467, + "step": 810 + }, + { + "epoch": 1.008039579468151, + "grad_norm": 0.7578125, + "learning_rate": 0.0001344884488448845, + "loss": 1.0493, + "step": 815 + }, + { + "epoch": 1.0142238713667284, + "grad_norm": 0.70703125, + "learning_rate": 0.00013531353135313532, + "loss": 1.0417, + "step": 820 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 0.640625, + "learning_rate": 0.00013613861386138615, + "loss": 1.0413, + "step": 825 + }, + { + "epoch": 1.0265924551638836, + "grad_norm": 0.8046875, + "learning_rate": 0.00013696369636963699, + "loss": 1.07, + "step": 830 + }, + { + "epoch": 1.0327767470624614, + "grad_norm": 0.6015625, + "learning_rate": 0.0001377887788778878, + "loss": 1.0425, + "step": 835 + }, + { + "epoch": 1.0389610389610389, + "grad_norm": 0.54296875, + "learning_rate": 0.0001386138613861386, + "loss": 1.0537, + "step": 840 + }, + { + "epoch": 1.0451453308596166, + "grad_norm": 0.5625, + "learning_rate": 0.00013943894389438946, + "loss": 1.0425, + "step": 845 + }, + { + "epoch": 1.051329622758194, + "grad_norm": 0.6171875, + "learning_rate": 0.00014026402640264027, + "loss": 1.0478, + "step": 850 + }, + { + "epoch": 1.0575139146567718, + "grad_norm": 0.48046875, + "learning_rate": 0.00014108910891089108, + "loss": 1.0427, + "step": 855 + }, + { + "epoch": 1.0636982065553493, + "grad_norm": 0.55078125, + "learning_rate": 0.00014191419141914194, + "loss": 1.0406, + "step": 860 + }, + { + "epoch": 1.069882498453927, + "grad_norm": 0.466796875, + "learning_rate": 0.00014273927392739275, + "loss": 1.0384, + "step": 865 + }, + { + "epoch": 1.0760667903525047, + "grad_norm": 0.61328125, + "learning_rate": 0.00014356435643564356, + "loss": 1.031, + "step": 870 + }, + { + "epoch": 1.0822510822510822, + "grad_norm": 0.6484375, + "learning_rate": 0.00014438943894389442, + "loss": 1.0371, + "step": 875 + }, + { + "epoch": 1.08843537414966, + "grad_norm": 0.482421875, + "learning_rate": 0.00014521452145214523, + "loss": 1.0256, + "step": 880 + }, + { + "epoch": 1.0946196660482375, + "grad_norm": 0.6640625, + "learning_rate": 0.00014603960396039603, + "loss": 1.049, + "step": 885 + }, + { + "epoch": 1.1008039579468152, + "grad_norm": 0.84765625, + "learning_rate": 0.00014686468646864687, + "loss": 1.0373, + "step": 890 + }, + { + "epoch": 1.1069882498453927, + "grad_norm": 0.48828125, + "learning_rate": 0.0001476897689768977, + "loss": 1.0404, + "step": 895 + }, + { + "epoch": 1.1131725417439704, + "grad_norm": 0.54296875, + "learning_rate": 0.0001485148514851485, + "loss": 1.0224, + "step": 900 + }, + { + "epoch": 1.119356833642548, + "grad_norm": 0.455078125, + "learning_rate": 0.00014933993399339935, + "loss": 1.035, + "step": 905 + }, + { + "epoch": 1.1255411255411256, + "grad_norm": 0.609375, + "learning_rate": 0.00015016501650165018, + "loss": 1.0376, + "step": 910 + }, + { + "epoch": 1.1317254174397031, + "grad_norm": 0.55078125, + "learning_rate": 0.000150990099009901, + "loss": 1.0351, + "step": 915 + }, + { + "epoch": 1.1379097093382808, + "grad_norm": 0.65625, + "learning_rate": 0.00015181518151815182, + "loss": 1.041, + "step": 920 + }, + { + "epoch": 1.1440940012368583, + "grad_norm": 0.66015625, + "learning_rate": 0.00015264026402640266, + "loss": 1.0341, + "step": 925 + }, + { + "epoch": 1.150278293135436, + "grad_norm": 0.58203125, + "learning_rate": 0.00015346534653465347, + "loss": 1.0292, + "step": 930 + }, + { + "epoch": 1.1564625850340136, + "grad_norm": 0.5625, + "learning_rate": 0.0001542904290429043, + "loss": 1.0407, + "step": 935 + }, + { + "epoch": 1.1626468769325913, + "grad_norm": 0.6015625, + "learning_rate": 0.0001551155115511551, + "loss": 1.0252, + "step": 940 + }, + { + "epoch": 1.1688311688311688, + "grad_norm": 0.6328125, + "learning_rate": 0.00015594059405940594, + "loss": 1.0396, + "step": 945 + }, + { + "epoch": 1.1750154607297465, + "grad_norm": 0.63671875, + "learning_rate": 0.00015676567656765678, + "loss": 1.0375, + "step": 950 + }, + { + "epoch": 1.181199752628324, + "grad_norm": 0.51953125, + "learning_rate": 0.00015759075907590759, + "loss": 1.0245, + "step": 955 + }, + { + "epoch": 1.1873840445269017, + "grad_norm": 0.49609375, + "learning_rate": 0.00015841584158415842, + "loss": 1.0366, + "step": 960 + }, + { + "epoch": 1.1935683364254792, + "grad_norm": 0.60546875, + "learning_rate": 0.00015924092409240926, + "loss": 1.0336, + "step": 965 + }, + { + "epoch": 1.199752628324057, + "grad_norm": 0.52734375, + "learning_rate": 0.00016006600660066006, + "loss": 1.0353, + "step": 970 + }, + { + "epoch": 1.2059369202226344, + "grad_norm": 0.58203125, + "learning_rate": 0.0001608910891089109, + "loss": 1.0294, + "step": 975 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.6796875, + "learning_rate": 0.00016171617161716173, + "loss": 1.0297, + "step": 980 + }, + { + "epoch": 1.2183055040197897, + "grad_norm": 0.494140625, + "learning_rate": 0.00016254125412541254, + "loss": 1.0302, + "step": 985 + }, + { + "epoch": 1.2244897959183674, + "grad_norm": 0.59375, + "learning_rate": 0.00016336633663366338, + "loss": 1.0242, + "step": 990 + }, + { + "epoch": 1.2306740878169449, + "grad_norm": 0.765625, + "learning_rate": 0.0001641914191419142, + "loss": 1.0314, + "step": 995 + }, + { + "epoch": 1.2368583797155226, + "grad_norm": 0.5859375, + "learning_rate": 0.00016501650165016502, + "loss": 1.0194, + "step": 1000 + }, + { + "epoch": 1.2430426716141, + "grad_norm": 0.5546875, + "learning_rate": 0.00016584158415841585, + "loss": 1.0178, + "step": 1005 + }, + { + "epoch": 1.2492269635126778, + "grad_norm": 0.59765625, + "learning_rate": 0.0001666666666666667, + "loss": 1.0241, + "step": 1010 + }, + { + "epoch": 1.2554112554112553, + "grad_norm": 0.68359375, + "learning_rate": 0.0001674917491749175, + "loss": 1.0271, + "step": 1015 + }, + { + "epoch": 1.261595547309833, + "grad_norm": 0.515625, + "learning_rate": 0.00016831683168316833, + "loss": 1.0307, + "step": 1020 + }, + { + "epoch": 1.2677798392084108, + "grad_norm": 0.57421875, + "learning_rate": 0.00016914191419141917, + "loss": 1.0366, + "step": 1025 + }, + { + "epoch": 1.2739641311069883, + "grad_norm": 0.86328125, + "learning_rate": 0.00016996699669966997, + "loss": 1.0276, + "step": 1030 + }, + { + "epoch": 1.2801484230055657, + "grad_norm": 0.8125, + "learning_rate": 0.0001707920792079208, + "loss": 1.0177, + "step": 1035 + }, + { + "epoch": 1.2863327149041435, + "grad_norm": 0.51953125, + "learning_rate": 0.00017161716171617162, + "loss": 1.0301, + "step": 1040 + }, + { + "epoch": 1.2925170068027212, + "grad_norm": 0.5546875, + "learning_rate": 0.00017244224422442245, + "loss": 1.0193, + "step": 1045 + }, + { + "epoch": 1.2987012987012987, + "grad_norm": 0.5078125, + "learning_rate": 0.00017326732673267329, + "loss": 1.0295, + "step": 1050 + }, + { + "epoch": 1.3048855905998762, + "grad_norm": 0.490234375, + "learning_rate": 0.0001740924092409241, + "loss": 1.0248, + "step": 1055 + }, + { + "epoch": 1.311069882498454, + "grad_norm": 0.6328125, + "learning_rate": 0.00017491749174917493, + "loss": 1.0218, + "step": 1060 + }, + { + "epoch": 1.3172541743970316, + "grad_norm": 0.6171875, + "learning_rate": 0.00017574257425742576, + "loss": 1.0141, + "step": 1065 + }, + { + "epoch": 1.3234384662956091, + "grad_norm": 0.5546875, + "learning_rate": 0.00017656765676567657, + "loss": 1.0232, + "step": 1070 + }, + { + "epoch": 1.3296227581941866, + "grad_norm": 0.56640625, + "learning_rate": 0.0001773927392739274, + "loss": 1.0198, + "step": 1075 + }, + { + "epoch": 1.3358070500927643, + "grad_norm": 0.490234375, + "learning_rate": 0.00017821782178217824, + "loss": 1.0236, + "step": 1080 + }, + { + "epoch": 1.341991341991342, + "grad_norm": 0.5625, + "learning_rate": 0.00017904290429042905, + "loss": 1.031, + "step": 1085 + }, + { + "epoch": 1.3481756338899196, + "grad_norm": 0.51171875, + "learning_rate": 0.00017986798679867986, + "loss": 1.0143, + "step": 1090 + }, + { + "epoch": 1.3543599257884973, + "grad_norm": 0.74609375, + "learning_rate": 0.00018069306930693072, + "loss": 1.0273, + "step": 1095 + }, + { + "epoch": 1.3605442176870748, + "grad_norm": 0.59765625, + "learning_rate": 0.00018151815181518153, + "loss": 1.0228, + "step": 1100 + }, + { + "epoch": 1.3667285095856525, + "grad_norm": 0.6171875, + "learning_rate": 0.00018234323432343233, + "loss": 1.0276, + "step": 1105 + }, + { + "epoch": 1.37291280148423, + "grad_norm": 0.5703125, + "learning_rate": 0.0001831683168316832, + "loss": 1.0196, + "step": 1110 + }, + { + "epoch": 1.3790970933828077, + "grad_norm": 0.466796875, + "learning_rate": 0.000183993399339934, + "loss": 1.0158, + "step": 1115 + }, + { + "epoch": 1.3852813852813852, + "grad_norm": 0.5859375, + "learning_rate": 0.0001848184818481848, + "loss": 1.0063, + "step": 1120 + }, + { + "epoch": 1.391465677179963, + "grad_norm": 0.6484375, + "learning_rate": 0.00018564356435643567, + "loss": 1.0192, + "step": 1125 + }, + { + "epoch": 1.3976499690785404, + "grad_norm": 0.84765625, + "learning_rate": 0.00018646864686468648, + "loss": 1.0265, + "step": 1130 + }, + { + "epoch": 1.4038342609771182, + "grad_norm": 0.52734375, + "learning_rate": 0.0001872937293729373, + "loss": 1.0139, + "step": 1135 + }, + { + "epoch": 1.4100185528756957, + "grad_norm": 0.54296875, + "learning_rate": 0.00018811881188118812, + "loss": 1.0196, + "step": 1140 + }, + { + "epoch": 1.4162028447742734, + "grad_norm": 0.72265625, + "learning_rate": 0.00018894389438943896, + "loss": 1.0174, + "step": 1145 + }, + { + "epoch": 1.4223871366728509, + "grad_norm": 0.80859375, + "learning_rate": 0.00018976897689768977, + "loss": 1.0146, + "step": 1150 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.6953125, + "learning_rate": 0.0001905940594059406, + "loss": 1.0182, + "step": 1155 + }, + { + "epoch": 1.434755720470006, + "grad_norm": 0.47265625, + "learning_rate": 0.00019141914191419144, + "loss": 1.0137, + "step": 1160 + }, + { + "epoch": 1.4409400123685838, + "grad_norm": 0.5234375, + "learning_rate": 0.00019224422442244224, + "loss": 1.0134, + "step": 1165 + }, + { + "epoch": 1.4471243042671613, + "grad_norm": 0.7109375, + "learning_rate": 0.00019306930693069308, + "loss": 1.0123, + "step": 1170 + }, + { + "epoch": 1.453308596165739, + "grad_norm": 0.54296875, + "learning_rate": 0.0001938943894389439, + "loss": 1.011, + "step": 1175 + }, + { + "epoch": 1.4594928880643168, + "grad_norm": 0.75, + "learning_rate": 0.00019471947194719472, + "loss": 0.9988, + "step": 1180 + }, + { + "epoch": 1.4656771799628943, + "grad_norm": 0.60546875, + "learning_rate": 0.00019554455445544556, + "loss": 1.0111, + "step": 1185 + }, + { + "epoch": 1.4718614718614718, + "grad_norm": 0.51953125, + "learning_rate": 0.00019636963696369636, + "loss": 1.0151, + "step": 1190 + }, + { + "epoch": 1.4780457637600495, + "grad_norm": 0.6171875, + "learning_rate": 0.0001971947194719472, + "loss": 1.0119, + "step": 1195 + }, + { + "epoch": 1.4842300556586272, + "grad_norm": 0.71484375, + "learning_rate": 0.00019801980198019803, + "loss": 1.0085, + "step": 1200 + }, + { + "epoch": 1.4904143475572047, + "grad_norm": 0.7421875, + "learning_rate": 0.00019884488448844884, + "loss": 1.0071, + "step": 1205 + }, + { + "epoch": 1.4965986394557822, + "grad_norm": 0.6171875, + "learning_rate": 0.00019966996699669968, + "loss": 1.0071, + "step": 1210 + }, + { + "epoch": 1.50278293135436, + "grad_norm": 0.51953125, + "learning_rate": 0.0001999999626730957, + "loss": 1.0041, + "step": 1215 + }, + { + "epoch": 1.5089672232529376, + "grad_norm": 0.73046875, + "learning_rate": 0.00019999973456433681, + "loss": 1.0051, + "step": 1220 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 0.68359375, + "learning_rate": 0.00019999929908446061, + "loss": 1.0011, + "step": 1225 + }, + { + "epoch": 1.5213358070500926, + "grad_norm": 0.59375, + "learning_rate": 0.00019999865623437013, + "loss": 1.007, + "step": 1230 + }, + { + "epoch": 1.5275200989486704, + "grad_norm": 0.73828125, + "learning_rate": 0.00019999780601539848, + "loss": 0.9903, + "step": 1235 + }, + { + "epoch": 1.533704390847248, + "grad_norm": 0.67578125, + "learning_rate": 0.00019999674842930876, + "loss": 1.0055, + "step": 1240 + }, + { + "epoch": 1.5398886827458256, + "grad_norm": 0.859375, + "learning_rate": 0.0001999954834782941, + "loss": 0.9958, + "step": 1245 + }, + { + "epoch": 1.546072974644403, + "grad_norm": 0.6328125, + "learning_rate": 0.00019999401116497763, + "loss": 1.0027, + "step": 1250 + }, + { + "epoch": 1.5522572665429808, + "grad_norm": 0.61328125, + "learning_rate": 0.00019999233149241253, + "loss": 1.0078, + "step": 1255 + }, + { + "epoch": 1.5584415584415585, + "grad_norm": 0.609375, + "learning_rate": 0.000199990444464082, + "loss": 1.0108, + "step": 1260 + }, + { + "epoch": 1.564625850340136, + "grad_norm": 0.5390625, + "learning_rate": 0.0001999883500838992, + "loss": 1.0173, + "step": 1265 + }, + { + "epoch": 1.5708101422387135, + "grad_norm": 0.546875, + "learning_rate": 0.00019998604835620717, + "loss": 1.0167, + "step": 1270 + }, + { + "epoch": 1.5769944341372912, + "grad_norm": 0.59375, + "learning_rate": 0.00019998353928577919, + "loss": 1.0032, + "step": 1275 + }, + { + "epoch": 1.583178726035869, + "grad_norm": 0.482421875, + "learning_rate": 0.00019998082287781826, + "loss": 1.0133, + "step": 1280 + }, + { + "epoch": 1.5893630179344465, + "grad_norm": 0.474609375, + "learning_rate": 0.00019997789913795747, + "loss": 0.9938, + "step": 1285 + }, + { + "epoch": 1.595547309833024, + "grad_norm": 0.53515625, + "learning_rate": 0.00019997476807225985, + "loss": 0.9967, + "step": 1290 + }, + { + "epoch": 1.601731601731602, + "grad_norm": 0.6953125, + "learning_rate": 0.00019997142968721833, + "loss": 1.0054, + "step": 1295 + }, + { + "epoch": 1.6079158936301794, + "grad_norm": 0.54296875, + "learning_rate": 0.00019996788398975578, + "loss": 1.0033, + "step": 1300 + }, + { + "epoch": 1.614100185528757, + "grad_norm": 0.47265625, + "learning_rate": 0.00019996413098722493, + "loss": 0.9963, + "step": 1305 + }, + { + "epoch": 1.6202844774273346, + "grad_norm": 0.6640625, + "learning_rate": 0.0001999601706874085, + "loss": 1.0035, + "step": 1310 + }, + { + "epoch": 1.6264687693259123, + "grad_norm": 0.482421875, + "learning_rate": 0.000199956003098519, + "loss": 0.9968, + "step": 1315 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 0.77734375, + "learning_rate": 0.00019995162822919883, + "loss": 0.9884, + "step": 1320 + }, + { + "epoch": 1.6388373531230673, + "grad_norm": 0.9140625, + "learning_rate": 0.00019994704608852022, + "loss": 1.0013, + "step": 1325 + }, + { + "epoch": 1.645021645021645, + "grad_norm": 0.7734375, + "learning_rate": 0.00019994225668598526, + "loss": 1.0153, + "step": 1330 + }, + { + "epoch": 1.6512059369202228, + "grad_norm": 0.5625, + "learning_rate": 0.00019993726003152582, + "loss": 1.0002, + "step": 1335 + }, + { + "epoch": 1.6573902288188003, + "grad_norm": 0.490234375, + "learning_rate": 0.0001999320561355035, + "loss": 1.0107, + "step": 1340 + }, + { + "epoch": 1.6635745207173778, + "grad_norm": 0.486328125, + "learning_rate": 0.00019992664500870976, + "loss": 1.0183, + "step": 1345 + }, + { + "epoch": 1.6697588126159555, + "grad_norm": 0.578125, + "learning_rate": 0.00019992102666236566, + "loss": 1.0067, + "step": 1350 + }, + { + "epoch": 1.6759431045145332, + "grad_norm": 0.478515625, + "learning_rate": 0.00019991520110812215, + "loss": 1.0032, + "step": 1355 + }, + { + "epoch": 1.6821273964131107, + "grad_norm": 0.51171875, + "learning_rate": 0.00019990916835805974, + "loss": 0.994, + "step": 1360 + }, + { + "epoch": 1.6883116883116882, + "grad_norm": 0.451171875, + "learning_rate": 0.00019990292842468868, + "loss": 0.9995, + "step": 1365 + }, + { + "epoch": 1.694495980210266, + "grad_norm": 0.47265625, + "learning_rate": 0.00019989648132094873, + "loss": 0.9878, + "step": 1370 + }, + { + "epoch": 1.7006802721088436, + "grad_norm": 0.66015625, + "learning_rate": 0.00019988982706020946, + "loss": 1.002, + "step": 1375 + }, + { + "epoch": 1.7068645640074211, + "grad_norm": 0.65234375, + "learning_rate": 0.00019988296565626987, + "loss": 0.9913, + "step": 1380 + }, + { + "epoch": 1.7130488559059986, + "grad_norm": 0.55078125, + "learning_rate": 0.00019987589712335856, + "loss": 1.0095, + "step": 1385 + }, + { + "epoch": 1.7192331478045764, + "grad_norm": 0.65625, + "learning_rate": 0.0001998686214761337, + "loss": 0.9962, + "step": 1390 + }, + { + "epoch": 1.725417439703154, + "grad_norm": 0.6875, + "learning_rate": 0.0001998611387296829, + "loss": 0.9979, + "step": 1395 + }, + { + "epoch": 1.7316017316017316, + "grad_norm": 0.55859375, + "learning_rate": 0.00019985344889952327, + "loss": 1.0057, + "step": 1400 + }, + { + "epoch": 1.737786023500309, + "grad_norm": 0.55078125, + "learning_rate": 0.00019984555200160128, + "loss": 0.9932, + "step": 1405 + }, + { + "epoch": 1.7439703153988868, + "grad_norm": 0.546875, + "learning_rate": 0.00019983744805229296, + "loss": 0.9973, + "step": 1410 + }, + { + "epoch": 1.7501546072974645, + "grad_norm": 0.59375, + "learning_rate": 0.00019982913706840353, + "loss": 0.9919, + "step": 1415 + }, + { + "epoch": 1.756338899196042, + "grad_norm": 0.443359375, + "learning_rate": 0.00019982061906716764, + "loss": 0.9875, + "step": 1420 + }, + { + "epoch": 1.7625231910946195, + "grad_norm": 0.5234375, + "learning_rate": 0.00019981189406624922, + "loss": 1.0013, + "step": 1425 + }, + { + "epoch": 1.7687074829931972, + "grad_norm": 0.59765625, + "learning_rate": 0.00019980296208374143, + "loss": 0.9961, + "step": 1430 + }, + { + "epoch": 1.774891774891775, + "grad_norm": 0.5, + "learning_rate": 0.00019979382313816668, + "loss": 0.9928, + "step": 1435 + }, + { + "epoch": 1.7810760667903525, + "grad_norm": 0.462890625, + "learning_rate": 0.00019978447724847652, + "loss": 0.9939, + "step": 1440 + }, + { + "epoch": 1.78726035868893, + "grad_norm": 0.5390625, + "learning_rate": 0.00019977492443405174, + "loss": 0.987, + "step": 1445 + }, + { + "epoch": 1.7934446505875077, + "grad_norm": 0.486328125, + "learning_rate": 0.00019976516471470216, + "loss": 1.0033, + "step": 1450 + }, + { + "epoch": 1.7996289424860854, + "grad_norm": 0.5625, + "learning_rate": 0.00019975519811066663, + "loss": 0.9884, + "step": 1455 + }, + { + "epoch": 1.805813234384663, + "grad_norm": 0.66796875, + "learning_rate": 0.0001997450246426131, + "loss": 0.9823, + "step": 1460 + }, + { + "epoch": 1.8119975262832406, + "grad_norm": 0.5546875, + "learning_rate": 0.00019973464433163844, + "loss": 0.9838, + "step": 1465 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.66796875, + "learning_rate": 0.0001997240571992685, + "loss": 0.993, + "step": 1470 + }, + { + "epoch": 1.8243661100803958, + "grad_norm": 0.5625, + "learning_rate": 0.00019971326326745793, + "loss": 0.9835, + "step": 1475 + }, + { + "epoch": 1.8305504019789733, + "grad_norm": 0.44140625, + "learning_rate": 0.00019970226255859038, + "loss": 0.99, + "step": 1480 + }, + { + "epoch": 1.836734693877551, + "grad_norm": 0.59375, + "learning_rate": 0.00019969105509547812, + "loss": 0.9923, + "step": 1485 + }, + { + "epoch": 1.8429189857761288, + "grad_norm": 0.55859375, + "learning_rate": 0.0001996796409013623, + "loss": 0.998, + "step": 1490 + }, + { + "epoch": 1.8491032776747063, + "grad_norm": 0.62890625, + "learning_rate": 0.0001996680199999127, + "loss": 0.9902, + "step": 1495 + }, + { + "epoch": 1.8552875695732838, + "grad_norm": 0.6875, + "learning_rate": 0.0001996561924152278, + "loss": 0.9845, + "step": 1500 + }, + { + "epoch": 1.8614718614718615, + "grad_norm": 0.703125, + "learning_rate": 0.0001996441581718347, + "loss": 0.9722, + "step": 1505 + }, + { + "epoch": 1.8676561533704392, + "grad_norm": 0.53515625, + "learning_rate": 0.00019963191729468888, + "loss": 0.9983, + "step": 1510 + }, + { + "epoch": 1.8738404452690167, + "grad_norm": 0.5, + "learning_rate": 0.00019961946980917456, + "loss": 0.9817, + "step": 1515 + }, + { + "epoch": 1.8800247371675942, + "grad_norm": 0.59375, + "learning_rate": 0.00019960681574110426, + "loss": 0.9803, + "step": 1520 + }, + { + "epoch": 1.886209029066172, + "grad_norm": 0.498046875, + "learning_rate": 0.00019959395511671898, + "loss": 0.9839, + "step": 1525 + }, + { + "epoch": 1.8923933209647497, + "grad_norm": 0.55078125, + "learning_rate": 0.00019958088796268793, + "loss": 0.9842, + "step": 1530 + }, + { + "epoch": 1.8985776128633272, + "grad_norm": 0.546875, + "learning_rate": 0.00019956761430610874, + "loss": 0.9782, + "step": 1535 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.462890625, + "learning_rate": 0.0001995541341745072, + "loss": 0.9888, + "step": 1540 + }, + { + "epoch": 1.9109461966604824, + "grad_norm": 0.451171875, + "learning_rate": 0.0001995404475958373, + "loss": 0.9859, + "step": 1545 + }, + { + "epoch": 1.91713048855906, + "grad_norm": 0.5078125, + "learning_rate": 0.0001995265545984811, + "loss": 0.9986, + "step": 1550 + }, + { + "epoch": 1.9233147804576376, + "grad_norm": 0.51953125, + "learning_rate": 0.00019951245521124874, + "loss": 0.9737, + "step": 1555 + }, + { + "epoch": 1.929499072356215, + "grad_norm": 0.47265625, + "learning_rate": 0.00019949814946337838, + "loss": 0.977, + "step": 1560 + }, + { + "epoch": 1.9356833642547928, + "grad_norm": 0.6015625, + "learning_rate": 0.00019948363738453607, + "loss": 0.9834, + "step": 1565 + }, + { + "epoch": 1.9418676561533705, + "grad_norm": 0.6015625, + "learning_rate": 0.00019946891900481578, + "loss": 0.9903, + "step": 1570 + }, + { + "epoch": 1.948051948051948, + "grad_norm": 0.484375, + "learning_rate": 0.00019945399435473922, + "loss": 0.9813, + "step": 1575 + }, + { + "epoch": 1.9542362399505255, + "grad_norm": 0.5234375, + "learning_rate": 0.0001994388634652559, + "loss": 0.9774, + "step": 1580 + }, + { + "epoch": 1.9604205318491033, + "grad_norm": 0.625, + "learning_rate": 0.00019942352636774296, + "loss": 0.9785, + "step": 1585 + }, + { + "epoch": 1.966604823747681, + "grad_norm": 0.80859375, + "learning_rate": 0.00019940798309400526, + "loss": 0.9799, + "step": 1590 + }, + { + "epoch": 1.9727891156462585, + "grad_norm": 0.64453125, + "learning_rate": 0.0001993922336762751, + "loss": 0.969, + "step": 1595 + }, + { + "epoch": 1.978973407544836, + "grad_norm": 0.6484375, + "learning_rate": 0.00019937627814721237, + "loss": 0.9758, + "step": 1600 + }, + { + "epoch": 1.9851576994434137, + "grad_norm": 0.49609375, + "learning_rate": 0.00019936011653990426, + "loss": 0.9819, + "step": 1605 + }, + { + "epoch": 1.9913419913419914, + "grad_norm": 0.447265625, + "learning_rate": 0.00019934374888786537, + "loss": 0.9826, + "step": 1610 + }, + { + "epoch": 1.997526283240569, + "grad_norm": 0.515625, + "learning_rate": 0.0001993271752250376, + "loss": 0.9661, + "step": 1615 + }, + { + "epoch": 2.0, + "eval_loss": 2.4507155418395996, + "eval_runtime": 0.5376, + "eval_samples_per_second": 18.6, + "eval_steps_per_second": 1.86, + "step": 1617 + }, + { + "epoch": 2.0037105751391464, + "grad_norm": 0.44921875, + "learning_rate": 0.00019931039558578997, + "loss": 0.9843, + "step": 1620 + }, + { + "epoch": 2.0098948670377244, + "grad_norm": 0.55859375, + "learning_rate": 0.00019929341000491876, + "loss": 0.9676, + "step": 1625 + }, + { + "epoch": 2.016079158936302, + "grad_norm": 0.458984375, + "learning_rate": 0.00019927621851764725, + "loss": 0.9727, + "step": 1630 + }, + { + "epoch": 2.0222634508348794, + "grad_norm": 0.46484375, + "learning_rate": 0.00019925882115962568, + "loss": 0.9764, + "step": 1635 + }, + { + "epoch": 2.028447742733457, + "grad_norm": 0.49609375, + "learning_rate": 0.00019924121796693127, + "loss": 0.9708, + "step": 1640 + }, + { + "epoch": 2.034632034632035, + "grad_norm": 0.61328125, + "learning_rate": 0.00019922340897606805, + "loss": 0.9728, + "step": 1645 + }, + { + "epoch": 2.0408163265306123, + "grad_norm": 0.486328125, + "learning_rate": 0.0001992053942239668, + "loss": 0.9724, + "step": 1650 + }, + { + "epoch": 2.04700061842919, + "grad_norm": 0.65625, + "learning_rate": 0.00019918717374798502, + "loss": 0.9669, + "step": 1655 + }, + { + "epoch": 2.0531849103277673, + "grad_norm": 0.61328125, + "learning_rate": 0.00019916874758590684, + "loss": 0.9634, + "step": 1660 + }, + { + "epoch": 2.0593692022263452, + "grad_norm": 0.546875, + "learning_rate": 0.00019915011577594286, + "loss": 0.9761, + "step": 1665 + }, + { + "epoch": 2.0655534941249227, + "grad_norm": 0.546875, + "learning_rate": 0.00019913127835673023, + "loss": 0.9667, + "step": 1670 + }, + { + "epoch": 2.0717377860235002, + "grad_norm": 0.6484375, + "learning_rate": 0.00019911223536733235, + "loss": 0.9747, + "step": 1675 + }, + { + "epoch": 2.0779220779220777, + "grad_norm": 0.69140625, + "learning_rate": 0.00019909298684723904, + "loss": 0.9682, + "step": 1680 + }, + { + "epoch": 2.0841063698206557, + "grad_norm": 0.69921875, + "learning_rate": 0.00019907353283636628, + "loss": 0.9606, + "step": 1685 + }, + { + "epoch": 2.090290661719233, + "grad_norm": 0.51953125, + "learning_rate": 0.00019905387337505612, + "loss": 0.9635, + "step": 1690 + }, + { + "epoch": 2.0964749536178107, + "grad_norm": 0.5234375, + "learning_rate": 0.00019903400850407676, + "loss": 0.9612, + "step": 1695 + }, + { + "epoch": 2.102659245516388, + "grad_norm": 0.5078125, + "learning_rate": 0.0001990139382646223, + "loss": 0.9682, + "step": 1700 + }, + { + "epoch": 2.108843537414966, + "grad_norm": 0.62109375, + "learning_rate": 0.00019899366269831274, + "loss": 0.9599, + "step": 1705 + }, + { + "epoch": 2.1150278293135436, + "grad_norm": 0.45703125, + "learning_rate": 0.00019897318184719385, + "loss": 0.9769, + "step": 1710 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.59765625, + "learning_rate": 0.00019895249575373712, + "loss": 0.9683, + "step": 1715 + }, + { + "epoch": 2.1273964131106986, + "grad_norm": 0.72265625, + "learning_rate": 0.00019893160446083963, + "loss": 0.9577, + "step": 1720 + }, + { + "epoch": 2.1335807050092765, + "grad_norm": 0.5703125, + "learning_rate": 0.000198910508011824, + "loss": 0.9543, + "step": 1725 + }, + { + "epoch": 2.139764996907854, + "grad_norm": 0.42578125, + "learning_rate": 0.00019888920645043831, + "loss": 0.9643, + "step": 1730 + }, + { + "epoch": 2.1459492888064315, + "grad_norm": 0.55078125, + "learning_rate": 0.00019886769982085597, + "loss": 0.9651, + "step": 1735 + }, + { + "epoch": 2.1521335807050095, + "grad_norm": 0.48046875, + "learning_rate": 0.00019884598816767563, + "loss": 0.9451, + "step": 1740 + }, + { + "epoch": 2.158317872603587, + "grad_norm": 0.53515625, + "learning_rate": 0.00019882407153592107, + "loss": 0.9691, + "step": 1745 + }, + { + "epoch": 2.1645021645021645, + "grad_norm": 0.5, + "learning_rate": 0.00019880194997104123, + "loss": 0.9687, + "step": 1750 + }, + { + "epoch": 2.170686456400742, + "grad_norm": 0.75390625, + "learning_rate": 0.00019877962351890993, + "loss": 0.9574, + "step": 1755 + }, + { + "epoch": 2.17687074829932, + "grad_norm": 0.6328125, + "learning_rate": 0.00019875709222582594, + "loss": 0.9632, + "step": 1760 + }, + { + "epoch": 2.1830550401978974, + "grad_norm": 0.478515625, + "learning_rate": 0.00019873435613851275, + "loss": 0.9643, + "step": 1765 + }, + { + "epoch": 2.189239332096475, + "grad_norm": 0.435546875, + "learning_rate": 0.00019871141530411853, + "loss": 0.9664, + "step": 1770 + }, + { + "epoch": 2.1954236239950524, + "grad_norm": 0.431640625, + "learning_rate": 0.00019868826977021615, + "loss": 0.9668, + "step": 1775 + }, + { + "epoch": 2.2016079158936304, + "grad_norm": 0.54296875, + "learning_rate": 0.00019866491958480284, + "loss": 0.9619, + "step": 1780 + }, + { + "epoch": 2.207792207792208, + "grad_norm": 0.55859375, + "learning_rate": 0.0001986413647963003, + "loss": 0.9564, + "step": 1785 + }, + { + "epoch": 2.2139764996907854, + "grad_norm": 0.478515625, + "learning_rate": 0.00019861760545355442, + "loss": 0.9649, + "step": 1790 + }, + { + "epoch": 2.220160791589363, + "grad_norm": 0.458984375, + "learning_rate": 0.00019859364160583544, + "loss": 0.9453, + "step": 1795 + }, + { + "epoch": 2.226345083487941, + "grad_norm": 0.578125, + "learning_rate": 0.00019856947330283752, + "loss": 0.9669, + "step": 1800 + }, + { + "epoch": 2.2325293753865183, + "grad_norm": 0.43359375, + "learning_rate": 0.0001985451005946789, + "loss": 0.9721, + "step": 1805 + }, + { + "epoch": 2.238713667285096, + "grad_norm": 0.478515625, + "learning_rate": 0.00019852052353190166, + "loss": 0.9657, + "step": 1810 + }, + { + "epoch": 2.2448979591836733, + "grad_norm": 0.53515625, + "learning_rate": 0.00019849574216547171, + "loss": 0.9648, + "step": 1815 + }, + { + "epoch": 2.2510822510822512, + "grad_norm": 0.4765625, + "learning_rate": 0.0001984707565467785, + "loss": 0.9544, + "step": 1820 + }, + { + "epoch": 2.2572665429808287, + "grad_norm": 0.427734375, + "learning_rate": 0.0001984455667276352, + "loss": 0.9737, + "step": 1825 + }, + { + "epoch": 2.2634508348794062, + "grad_norm": 0.50390625, + "learning_rate": 0.00019842017276027832, + "loss": 0.9538, + "step": 1830 + }, + { + "epoch": 2.2696351267779837, + "grad_norm": 0.47265625, + "learning_rate": 0.00019839457469736775, + "loss": 0.9745, + "step": 1835 + }, + { + "epoch": 2.2758194186765617, + "grad_norm": 0.486328125, + "learning_rate": 0.00019836877259198662, + "loss": 0.958, + "step": 1840 + }, + { + "epoch": 2.282003710575139, + "grad_norm": 0.51953125, + "learning_rate": 0.00019834276649764124, + "loss": 0.9637, + "step": 1845 + }, + { + "epoch": 2.2881880024737167, + "grad_norm": 0.490234375, + "learning_rate": 0.0001983165564682608, + "loss": 0.9706, + "step": 1850 + }, + { + "epoch": 2.2943722943722946, + "grad_norm": 0.52734375, + "learning_rate": 0.00019829014255819753, + "loss": 0.9671, + "step": 1855 + }, + { + "epoch": 2.300556586270872, + "grad_norm": 0.61328125, + "learning_rate": 0.00019826352482222638, + "loss": 0.9571, + "step": 1860 + }, + { + "epoch": 2.3067408781694496, + "grad_norm": 0.49609375, + "learning_rate": 0.000198236703315545, + "loss": 0.9581, + "step": 1865 + }, + { + "epoch": 2.312925170068027, + "grad_norm": 0.57421875, + "learning_rate": 0.00019820967809377357, + "loss": 0.9676, + "step": 1870 + }, + { + "epoch": 2.3191094619666046, + "grad_norm": 0.546875, + "learning_rate": 0.0001981824492129548, + "loss": 0.9489, + "step": 1875 + }, + { + "epoch": 2.3252937538651826, + "grad_norm": 0.6640625, + "learning_rate": 0.00019815501672955358, + "loss": 0.9609, + "step": 1880 + }, + { + "epoch": 2.33147804576376, + "grad_norm": 0.4921875, + "learning_rate": 0.0001981273807004572, + "loss": 0.9569, + "step": 1885 + }, + { + "epoch": 2.3376623376623376, + "grad_norm": 0.5, + "learning_rate": 0.0001980995411829749, + "loss": 0.959, + "step": 1890 + }, + { + "epoch": 2.3438466295609155, + "grad_norm": 0.458984375, + "learning_rate": 0.00019807149823483798, + "loss": 0.9614, + "step": 1895 + }, + { + "epoch": 2.350030921459493, + "grad_norm": 0.50390625, + "learning_rate": 0.00019804325191419956, + "loss": 0.9687, + "step": 1900 + }, + { + "epoch": 2.3562152133580705, + "grad_norm": 0.43359375, + "learning_rate": 0.0001980148022796345, + "loss": 0.9598, + "step": 1905 + }, + { + "epoch": 2.362399505256648, + "grad_norm": 0.5390625, + "learning_rate": 0.00019798614939013932, + "loss": 0.9572, + "step": 1910 + }, + { + "epoch": 2.3685837971552255, + "grad_norm": 0.50390625, + "learning_rate": 0.00019795729330513196, + "loss": 0.9586, + "step": 1915 + }, + { + "epoch": 2.3747680890538034, + "grad_norm": 0.515625, + "learning_rate": 0.00019792823408445174, + "loss": 0.9532, + "step": 1920 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 0.6171875, + "learning_rate": 0.00019789897178835926, + "loss": 0.9444, + "step": 1925 + }, + { + "epoch": 2.3871366728509584, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978695064775363, + "loss": 0.9584, + "step": 1930 + }, + { + "epoch": 2.3933209647495364, + "grad_norm": 0.5078125, + "learning_rate": 0.0001978398382130855, + "loss": 0.9628, + "step": 1935 + }, + { + "epoch": 2.399505256648114, + "grad_norm": 0.6015625, + "learning_rate": 0.00019780996705653044, + "loss": 0.9545, + "step": 1940 + }, + { + "epoch": 2.4056895485466914, + "grad_norm": 0.431640625, + "learning_rate": 0.00019777989306981542, + "loss": 0.9496, + "step": 1945 + }, + { + "epoch": 2.411873840445269, + "grad_norm": 0.59765625, + "learning_rate": 0.00019774961631530545, + "loss": 0.9541, + "step": 1950 + }, + { + "epoch": 2.418058132343847, + "grad_norm": 0.75, + "learning_rate": 0.00019771913685578585, + "loss": 0.9581, + "step": 1955 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.74609375, + "learning_rate": 0.0001976884547544624, + "loss": 0.9535, + "step": 1960 + }, + { + "epoch": 2.430426716141002, + "grad_norm": 0.640625, + "learning_rate": 0.00019765757007496115, + "loss": 0.9595, + "step": 1965 + }, + { + "epoch": 2.4366110080395793, + "grad_norm": 0.54296875, + "learning_rate": 0.0001976264828813281, + "loss": 0.9693, + "step": 1970 + }, + { + "epoch": 2.4427952999381572, + "grad_norm": 0.62890625, + "learning_rate": 0.00019759519323802932, + "loss": 0.9582, + "step": 1975 + }, + { + "epoch": 2.4489795918367347, + "grad_norm": 0.462890625, + "learning_rate": 0.00019756370120995066, + "loss": 0.9525, + "step": 1980 + }, + { + "epoch": 2.4551638837353122, + "grad_norm": 0.474609375, + "learning_rate": 0.00019753200686239763, + "loss": 0.95, + "step": 1985 + }, + { + "epoch": 2.4613481756338897, + "grad_norm": 0.62890625, + "learning_rate": 0.0001975001102610954, + "loss": 0.9582, + "step": 1990 + }, + { + "epoch": 2.4675324675324677, + "grad_norm": 0.54296875, + "learning_rate": 0.00019746801147218842, + "loss": 0.9541, + "step": 1995 + }, + { + "epoch": 2.473716759431045, + "grad_norm": 0.58984375, + "learning_rate": 0.0001974357105622405, + "loss": 0.9529, + "step": 2000 + }, + { + "epoch": 2.4799010513296227, + "grad_norm": 0.5078125, + "learning_rate": 0.00019740320759823458, + "loss": 0.9606, + "step": 2005 + }, + { + "epoch": 2.4860853432282, + "grad_norm": 0.46875, + "learning_rate": 0.0001973705026475726, + "loss": 0.9632, + "step": 2010 + }, + { + "epoch": 2.492269635126778, + "grad_norm": 0.56640625, + "learning_rate": 0.00019733759577807538, + "loss": 0.9567, + "step": 2015 + }, + { + "epoch": 2.4984539270253556, + "grad_norm": 0.439453125, + "learning_rate": 0.00019730448705798239, + "loss": 0.9492, + "step": 2020 + }, + { + "epoch": 2.504638218923933, + "grad_norm": 0.498046875, + "learning_rate": 0.0001972711765559518, + "loss": 0.96, + "step": 2025 + }, + { + "epoch": 2.5108225108225106, + "grad_norm": 0.64453125, + "learning_rate": 0.0001972376643410601, + "loss": 0.9524, + "step": 2030 + }, + { + "epoch": 2.5170068027210886, + "grad_norm": 0.470703125, + "learning_rate": 0.00019720395048280215, + "loss": 0.9538, + "step": 2035 + }, + { + "epoch": 2.523191094619666, + "grad_norm": 0.478515625, + "learning_rate": 0.00019717003505109095, + "loss": 0.9492, + "step": 2040 + }, + { + "epoch": 2.5293753865182436, + "grad_norm": 0.453125, + "learning_rate": 0.0001971359181162575, + "loss": 0.9496, + "step": 2045 + }, + { + "epoch": 2.5355596784168215, + "grad_norm": 0.443359375, + "learning_rate": 0.00019710159974905064, + "loss": 0.9625, + "step": 2050 + }, + { + "epoch": 2.541743970315399, + "grad_norm": 0.482421875, + "learning_rate": 0.00019706708002063694, + "loss": 0.9555, + "step": 2055 + }, + { + "epoch": 2.5479282622139765, + "grad_norm": 0.4453125, + "learning_rate": 0.00019703235900260055, + "loss": 0.9655, + "step": 2060 + }, + { + "epoch": 2.554112554112554, + "grad_norm": 0.443359375, + "learning_rate": 0.00019699743676694303, + "loss": 0.9554, + "step": 2065 + }, + { + "epoch": 2.5602968460111315, + "grad_norm": 0.474609375, + "learning_rate": 0.00019696231338608316, + "loss": 0.9522, + "step": 2070 + }, + { + "epoch": 2.5664811379097094, + "grad_norm": 0.60546875, + "learning_rate": 0.00019692698893285693, + "loss": 0.952, + "step": 2075 + }, + { + "epoch": 2.572665429808287, + "grad_norm": 0.43359375, + "learning_rate": 0.00019689146348051719, + "loss": 0.9524, + "step": 2080 + }, + { + "epoch": 2.5788497217068644, + "grad_norm": 0.5078125, + "learning_rate": 0.00019685573710273376, + "loss": 0.9523, + "step": 2085 + }, + { + "epoch": 2.5850340136054424, + "grad_norm": 0.51953125, + "learning_rate": 0.0001968198098735929, + "loss": 0.9491, + "step": 2090 + }, + { + "epoch": 2.59121830550402, + "grad_norm": 0.40234375, + "learning_rate": 0.0001967836818675976, + "loss": 0.9496, + "step": 2095 + }, + { + "epoch": 2.5974025974025974, + "grad_norm": 0.470703125, + "learning_rate": 0.0001967473531596671, + "loss": 0.9408, + "step": 2100 + }, + { + "epoch": 2.603586889301175, + "grad_norm": 0.45703125, + "learning_rate": 0.0001967108238251368, + "loss": 0.9526, + "step": 2105 + }, + { + "epoch": 2.6097711811997524, + "grad_norm": 0.462890625, + "learning_rate": 0.00019667409393975822, + "loss": 0.9476, + "step": 2110 + }, + { + "epoch": 2.6159554730983303, + "grad_norm": 0.60546875, + "learning_rate": 0.00019663716357969874, + "loss": 0.9495, + "step": 2115 + }, + { + "epoch": 2.622139764996908, + "grad_norm": 0.51953125, + "learning_rate": 0.00019660003282154147, + "loss": 0.945, + "step": 2120 + }, + { + "epoch": 2.6283240568954853, + "grad_norm": 0.421875, + "learning_rate": 0.00019656270174228503, + "loss": 0.9506, + "step": 2125 + }, + { + "epoch": 2.6345083487940633, + "grad_norm": 0.427734375, + "learning_rate": 0.00019652517041934356, + "loss": 0.9483, + "step": 2130 + }, + { + "epoch": 2.6406926406926408, + "grad_norm": 0.458984375, + "learning_rate": 0.0001964874389305464, + "loss": 0.948, + "step": 2135 + }, + { + "epoch": 2.6468769325912183, + "grad_norm": 0.625, + "learning_rate": 0.00019644950735413788, + "loss": 0.9464, + "step": 2140 + }, + { + "epoch": 2.6530612244897958, + "grad_norm": 0.66015625, + "learning_rate": 0.00019641137576877744, + "loss": 0.944, + "step": 2145 + }, + { + "epoch": 2.6592455163883733, + "grad_norm": 0.54296875, + "learning_rate": 0.00019637304425353916, + "loss": 0.9437, + "step": 2150 + }, + { + "epoch": 2.665429808286951, + "grad_norm": 0.56640625, + "learning_rate": 0.00019633451288791166, + "loss": 0.9494, + "step": 2155 + }, + { + "epoch": 2.6716141001855287, + "grad_norm": 0.6328125, + "learning_rate": 0.0001962957817517982, + "loss": 0.9497, + "step": 2160 + }, + { + "epoch": 2.6777983920841066, + "grad_norm": 0.470703125, + "learning_rate": 0.00019625685092551612, + "loss": 0.9436, + "step": 2165 + }, + { + "epoch": 2.683982683982684, + "grad_norm": 0.5390625, + "learning_rate": 0.0001962177204897969, + "loss": 0.9509, + "step": 2170 + }, + { + "epoch": 2.6901669758812616, + "grad_norm": 0.515625, + "learning_rate": 0.00019617839052578603, + "loss": 0.9474, + "step": 2175 + }, + { + "epoch": 2.696351267779839, + "grad_norm": 0.62890625, + "learning_rate": 0.0001961388611150427, + "loss": 0.942, + "step": 2180 + }, + { + "epoch": 2.7025355596784166, + "grad_norm": 0.58203125, + "learning_rate": 0.00019609913233953967, + "loss": 0.9558, + "step": 2185 + }, + { + "epoch": 2.7087198515769946, + "grad_norm": 0.453125, + "learning_rate": 0.00019605920428166323, + "loss": 0.9616, + "step": 2190 + }, + { + "epoch": 2.714904143475572, + "grad_norm": 0.4453125, + "learning_rate": 0.0001960190770242128, + "loss": 0.9594, + "step": 2195 + }, + { + "epoch": 2.7210884353741496, + "grad_norm": 0.44140625, + "learning_rate": 0.00019597875065040094, + "loss": 0.9455, + "step": 2200 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.48828125, + "learning_rate": 0.00019593822524385316, + "loss": 0.9458, + "step": 2205 + }, + { + "epoch": 2.733457019171305, + "grad_norm": 0.435546875, + "learning_rate": 0.00019589750088860766, + "loss": 0.9468, + "step": 2210 + }, + { + "epoch": 2.7396413110698825, + "grad_norm": 0.50390625, + "learning_rate": 0.00019585657766911524, + "loss": 0.9448, + "step": 2215 + }, + { + "epoch": 2.74582560296846, + "grad_norm": 0.58203125, + "learning_rate": 0.000195815455670239, + "loss": 0.9415, + "step": 2220 + }, + { + "epoch": 2.7520098948670375, + "grad_norm": 0.419921875, + "learning_rate": 0.00019577413497725438, + "loss": 0.9419, + "step": 2225 + }, + { + "epoch": 2.7581941867656155, + "grad_norm": 0.5078125, + "learning_rate": 0.00019573261567584874, + "loss": 0.9409, + "step": 2230 + }, + { + "epoch": 2.764378478664193, + "grad_norm": 0.46875, + "learning_rate": 0.0001956908978521214, + "loss": 0.9544, + "step": 2235 + }, + { + "epoch": 2.7705627705627704, + "grad_norm": 0.71875, + "learning_rate": 0.00019564898159258324, + "loss": 0.9327, + "step": 2240 + }, + { + "epoch": 2.7767470624613484, + "grad_norm": 0.578125, + "learning_rate": 0.00019560686698415677, + "loss": 0.9478, + "step": 2245 + }, + { + "epoch": 2.782931354359926, + "grad_norm": 0.462890625, + "learning_rate": 0.00019556455411417573, + "loss": 0.9384, + "step": 2250 + }, + { + "epoch": 2.7891156462585034, + "grad_norm": 0.48046875, + "learning_rate": 0.00019552204307038502, + "loss": 0.9451, + "step": 2255 + }, + { + "epoch": 2.795299938157081, + "grad_norm": 0.51953125, + "learning_rate": 0.0001954793339409405, + "loss": 0.9485, + "step": 2260 + }, + { + "epoch": 2.8014842300556584, + "grad_norm": 0.5625, + "learning_rate": 0.0001954364268144088, + "loss": 0.9571, + "step": 2265 + }, + { + "epoch": 2.8076685219542363, + "grad_norm": 0.46875, + "learning_rate": 0.00019539332177976714, + "loss": 0.9504, + "step": 2270 + }, + { + "epoch": 2.813852813852814, + "grad_norm": 0.50390625, + "learning_rate": 0.00019535001892640317, + "loss": 0.9422, + "step": 2275 + }, + { + "epoch": 2.8200371057513913, + "grad_norm": 0.45703125, + "learning_rate": 0.00019530651834411474, + "loss": 0.9473, + "step": 2280 + }, + { + "epoch": 2.8262213976499693, + "grad_norm": 0.4375, + "learning_rate": 0.00019526282012310975, + "loss": 0.9467, + "step": 2285 + }, + { + "epoch": 2.8324056895485468, + "grad_norm": 0.455078125, + "learning_rate": 0.00019521892435400587, + "loss": 0.955, + "step": 2290 + }, + { + "epoch": 2.8385899814471243, + "grad_norm": 0.462890625, + "learning_rate": 0.00019517483112783054, + "loss": 0.9507, + "step": 2295 + }, + { + "epoch": 2.8447742733457018, + "grad_norm": 0.47265625, + "learning_rate": 0.00019513054053602055, + "loss": 0.9447, + "step": 2300 + }, + { + "epoch": 2.8509585652442793, + "grad_norm": 0.46484375, + "learning_rate": 0.00019508605267042214, + "loss": 0.9553, + "step": 2305 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.44140625, + "learning_rate": 0.00019504136762329047, + "loss": 0.9454, + "step": 2310 + }, + { + "epoch": 2.8633271490414347, + "grad_norm": 0.55859375, + "learning_rate": 0.00019499648548728965, + "loss": 0.9529, + "step": 2315 + }, + { + "epoch": 2.869511440940012, + "grad_norm": 0.57421875, + "learning_rate": 0.00019495140635549261, + "loss": 0.9455, + "step": 2320 + }, + { + "epoch": 2.87569573283859, + "grad_norm": 0.431640625, + "learning_rate": 0.00019490613032138062, + "loss": 0.9468, + "step": 2325 + }, + { + "epoch": 2.8818800247371676, + "grad_norm": 0.515625, + "learning_rate": 0.00019486065747884333, + "loss": 0.9527, + "step": 2330 + }, + { + "epoch": 2.888064316635745, + "grad_norm": 0.451171875, + "learning_rate": 0.0001948149879221786, + "loss": 0.9526, + "step": 2335 + }, + { + "epoch": 2.8942486085343226, + "grad_norm": 0.48046875, + "learning_rate": 0.0001947691217460921, + "loss": 0.9494, + "step": 2340 + }, + { + "epoch": 2.9004329004329006, + "grad_norm": 0.52734375, + "learning_rate": 0.00019472305904569729, + "loss": 0.9309, + "step": 2345 + }, + { + "epoch": 2.906617192331478, + "grad_norm": 0.44921875, + "learning_rate": 0.0001946767999165152, + "loss": 0.9505, + "step": 2350 + }, + { + "epoch": 2.9128014842300556, + "grad_norm": 0.5, + "learning_rate": 0.0001946303444544741, + "loss": 0.9355, + "step": 2355 + }, + { + "epoch": 2.9189857761286335, + "grad_norm": 0.4140625, + "learning_rate": 0.00019458369275590954, + "loss": 0.9423, + "step": 2360 + }, + { + "epoch": 2.925170068027211, + "grad_norm": 0.470703125, + "learning_rate": 0.00019453684491756382, + "loss": 0.9392, + "step": 2365 + }, + { + "epoch": 2.9313543599257885, + "grad_norm": 0.4609375, + "learning_rate": 0.00019448980103658613, + "loss": 0.9435, + "step": 2370 + }, + { + "epoch": 2.937538651824366, + "grad_norm": 0.6328125, + "learning_rate": 0.00019444256121053217, + "loss": 0.9505, + "step": 2375 + }, + { + "epoch": 2.9437229437229435, + "grad_norm": 0.58984375, + "learning_rate": 0.00019439512553736394, + "loss": 0.945, + "step": 2380 + }, + { + "epoch": 2.9499072356215215, + "grad_norm": 0.5, + "learning_rate": 0.00019434749411544958, + "loss": 0.9408, + "step": 2385 + }, + { + "epoch": 2.956091527520099, + "grad_norm": 0.48046875, + "learning_rate": 0.0001942996670435632, + "loss": 0.9521, + "step": 2390 + }, + { + "epoch": 2.9622758194186765, + "grad_norm": 0.5625, + "learning_rate": 0.00019425164442088451, + "loss": 0.9453, + "step": 2395 + }, + { + "epoch": 2.9684601113172544, + "grad_norm": 0.466796875, + "learning_rate": 0.0001942034263469989, + "loss": 0.9481, + "step": 2400 + }, + { + "epoch": 2.974644403215832, + "grad_norm": 0.478515625, + "learning_rate": 0.000194155012921897, + "loss": 0.9466, + "step": 2405 + }, + { + "epoch": 2.9808286951144094, + "grad_norm": 0.474609375, + "learning_rate": 0.0001941064042459745, + "loss": 0.9376, + "step": 2410 + }, + { + "epoch": 2.987012987012987, + "grad_norm": 0.49609375, + "learning_rate": 0.00019405760042003203, + "loss": 0.9422, + "step": 2415 + }, + { + "epoch": 2.9931972789115644, + "grad_norm": 0.51171875, + "learning_rate": 0.00019400860154527493, + "loss": 0.9541, + "step": 2420 + }, + { + "epoch": 2.9993815708101423, + "grad_norm": 0.484375, + "learning_rate": 0.0001939594077233129, + "loss": 0.9526, + "step": 2425 + }, + { + "epoch": 2.9993815708101423, + "eval_loss": 2.465909481048584, + "eval_runtime": 0.6411, + "eval_samples_per_second": 15.599, + "eval_steps_per_second": 1.56, + "step": 2425 + }, + { + "epoch": 3.00556586270872, + "grad_norm": 0.462890625, + "learning_rate": 0.0001939100190561601, + "loss": 0.935, + "step": 2430 + }, + { + "epoch": 3.0117501546072973, + "grad_norm": 0.484375, + "learning_rate": 0.00019386043564623452, + "loss": 0.9371, + "step": 2435 + }, + { + "epoch": 3.0179344465058753, + "grad_norm": 0.5234375, + "learning_rate": 0.00019381065759635822, + "loss": 0.932, + "step": 2440 + }, + { + "epoch": 3.0241187384044528, + "grad_norm": 0.435546875, + "learning_rate": 0.00019376068500975667, + "loss": 0.93, + "step": 2445 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 0.4765625, + "learning_rate": 0.0001937105179900589, + "loss": 0.9412, + "step": 2450 + }, + { + "epoch": 3.0364873222016078, + "grad_norm": 0.53125, + "learning_rate": 0.00019366015664129714, + "loss": 0.9272, + "step": 2455 + }, + { + "epoch": 3.0426716141001857, + "grad_norm": 0.5, + "learning_rate": 0.00019360960106790643, + "loss": 0.9351, + "step": 2460 + }, + { + "epoch": 3.048855905998763, + "grad_norm": 0.412109375, + "learning_rate": 0.00019355885137472488, + "loss": 0.9307, + "step": 2465 + }, + { + "epoch": 3.0550401978973407, + "grad_norm": 0.5703125, + "learning_rate": 0.00019350790766699282, + "loss": 0.9332, + "step": 2470 + }, + { + "epoch": 3.061224489795918, + "grad_norm": 0.6484375, + "learning_rate": 0.00019345677005035315, + "loss": 0.9351, + "step": 2475 + }, + { + "epoch": 3.067408781694496, + "grad_norm": 0.50390625, + "learning_rate": 0.0001934054386308508, + "loss": 0.9241, + "step": 2480 + }, + { + "epoch": 3.0735930735930737, + "grad_norm": 0.49609375, + "learning_rate": 0.00019335391351493257, + "loss": 0.9182, + "step": 2485 + }, + { + "epoch": 3.079777365491651, + "grad_norm": 0.43359375, + "learning_rate": 0.00019330219480944694, + "loss": 0.9133, + "step": 2490 + }, + { + "epoch": 3.0859616573902287, + "grad_norm": 0.4609375, + "learning_rate": 0.00019325028262164384, + "loss": 0.9386, + "step": 2495 + }, + { + "epoch": 3.0921459492888066, + "grad_norm": 0.439453125, + "learning_rate": 0.0001931981770591745, + "loss": 0.9302, + "step": 2500 + }, + { + "epoch": 3.098330241187384, + "grad_norm": 0.421875, + "learning_rate": 0.00019314587823009103, + "loss": 0.9421, + "step": 2505 + }, + { + "epoch": 3.1045145330859616, + "grad_norm": 0.486328125, + "learning_rate": 0.00019309338624284644, + "loss": 0.9286, + "step": 2510 + }, + { + "epoch": 3.110698824984539, + "grad_norm": 0.50390625, + "learning_rate": 0.0001930407012062942, + "loss": 0.9291, + "step": 2515 + }, + { + "epoch": 3.116883116883117, + "grad_norm": 0.431640625, + "learning_rate": 0.00019298782322968815, + "loss": 0.9332, + "step": 2520 + }, + { + "epoch": 3.1230674087816945, + "grad_norm": 0.43359375, + "learning_rate": 0.00019293475242268223, + "loss": 0.9427, + "step": 2525 + }, + { + "epoch": 3.129251700680272, + "grad_norm": 0.5625, + "learning_rate": 0.0001928814888953303, + "loss": 0.9277, + "step": 2530 + }, + { + "epoch": 3.1354359925788495, + "grad_norm": 0.4140625, + "learning_rate": 0.0001928280327580858, + "loss": 0.9286, + "step": 2535 + }, + { + "epoch": 3.1416202844774275, + "grad_norm": 0.431640625, + "learning_rate": 0.0001927743841218016, + "loss": 0.9214, + "step": 2540 + }, + { + "epoch": 3.147804576376005, + "grad_norm": 0.66796875, + "learning_rate": 0.00019272054309772978, + "loss": 0.9328, + "step": 2545 + }, + { + "epoch": 3.1539888682745825, + "grad_norm": 0.5390625, + "learning_rate": 0.00019266650979752136, + "loss": 0.9427, + "step": 2550 + }, + { + "epoch": 3.16017316017316, + "grad_norm": 0.69921875, + "learning_rate": 0.0001926122843332261, + "loss": 0.9285, + "step": 2555 + }, + { + "epoch": 3.166357452071738, + "grad_norm": 0.58984375, + "learning_rate": 0.00019255786681729225, + "loss": 0.9344, + "step": 2560 + }, + { + "epoch": 3.1725417439703154, + "grad_norm": 0.56640625, + "learning_rate": 0.00019250325736256633, + "loss": 0.9332, + "step": 2565 + }, + { + "epoch": 3.178726035868893, + "grad_norm": 0.51171875, + "learning_rate": 0.00019244845608229293, + "loss": 0.9357, + "step": 2570 + }, + { + "epoch": 3.1849103277674704, + "grad_norm": 0.4765625, + "learning_rate": 0.00019239346309011426, + "loss": 0.937, + "step": 2575 + }, + { + "epoch": 3.1910946196660483, + "grad_norm": 0.46875, + "learning_rate": 0.00019233827850007027, + "loss": 0.9332, + "step": 2580 + }, + { + "epoch": 3.197278911564626, + "grad_norm": 0.55859375, + "learning_rate": 0.00019228290242659816, + "loss": 0.937, + "step": 2585 + }, + { + "epoch": 3.2034632034632033, + "grad_norm": 0.498046875, + "learning_rate": 0.00019222733498453222, + "loss": 0.9214, + "step": 2590 + }, + { + "epoch": 3.2096474953617813, + "grad_norm": 0.546875, + "learning_rate": 0.0001921715762891036, + "loss": 0.9329, + "step": 2595 + }, + { + "epoch": 3.215831787260359, + "grad_norm": 0.53515625, + "learning_rate": 0.00019211562645594002, + "loss": 0.9317, + "step": 2600 + }, + { + "epoch": 3.2220160791589363, + "grad_norm": 0.498046875, + "learning_rate": 0.00019205948560106556, + "loss": 0.9232, + "step": 2605 + }, + { + "epoch": 3.228200371057514, + "grad_norm": 0.51953125, + "learning_rate": 0.00019200315384090044, + "loss": 0.9325, + "step": 2610 + }, + { + "epoch": 3.2343846629560917, + "grad_norm": 0.51953125, + "learning_rate": 0.00019194663129226084, + "loss": 0.9319, + "step": 2615 + }, + { + "epoch": 3.2405689548546692, + "grad_norm": 0.52734375, + "learning_rate": 0.00019188991807235844, + "loss": 0.9368, + "step": 2620 + }, + { + "epoch": 3.2467532467532467, + "grad_norm": 0.51171875, + "learning_rate": 0.00019183301429880043, + "loss": 0.9295, + "step": 2625 + }, + { + "epoch": 3.2529375386518242, + "grad_norm": 0.43359375, + "learning_rate": 0.0001917759200895891, + "loss": 0.9421, + "step": 2630 + }, + { + "epoch": 3.259121830550402, + "grad_norm": 0.462890625, + "learning_rate": 0.00019171863556312167, + "loss": 0.9335, + "step": 2635 + }, + { + "epoch": 3.2653061224489797, + "grad_norm": 0.5234375, + "learning_rate": 0.00019166116083819002, + "loss": 0.9317, + "step": 2640 + }, + { + "epoch": 3.271490414347557, + "grad_norm": 0.4609375, + "learning_rate": 0.00019160349603398043, + "loss": 0.9297, + "step": 2645 + }, + { + "epoch": 3.2776747062461347, + "grad_norm": 0.55078125, + "learning_rate": 0.00019154564127007336, + "loss": 0.9371, + "step": 2650 + }, + { + "epoch": 3.2838589981447126, + "grad_norm": 0.59375, + "learning_rate": 0.00019148759666644325, + "loss": 0.9286, + "step": 2655 + }, + { + "epoch": 3.29004329004329, + "grad_norm": 0.396484375, + "learning_rate": 0.0001914293623434581, + "loss": 0.9404, + "step": 2660 + }, + { + "epoch": 3.2962275819418676, + "grad_norm": 0.5234375, + "learning_rate": 0.00019137093842187944, + "loss": 0.9292, + "step": 2665 + }, + { + "epoch": 3.302411873840445, + "grad_norm": 0.439453125, + "learning_rate": 0.00019131232502286188, + "loss": 0.9224, + "step": 2670 + }, + { + "epoch": 3.308596165739023, + "grad_norm": 0.4375, + "learning_rate": 0.00019125352226795307, + "loss": 0.9313, + "step": 2675 + }, + { + "epoch": 3.3147804576376005, + "grad_norm": 0.4921875, + "learning_rate": 0.00019119453027909323, + "loss": 0.928, + "step": 2680 + }, + { + "epoch": 3.320964749536178, + "grad_norm": 0.412109375, + "learning_rate": 0.00019113534917861502, + "loss": 0.9357, + "step": 2685 + }, + { + "epoch": 3.3271490414347555, + "grad_norm": 0.421875, + "learning_rate": 0.0001910759790892433, + "loss": 0.9339, + "step": 2690 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.470703125, + "learning_rate": 0.0001910164201340948, + "loss": 0.9342, + "step": 2695 + }, + { + "epoch": 3.339517625231911, + "grad_norm": 0.41796875, + "learning_rate": 0.0001909566724366779, + "loss": 0.9217, + "step": 2700 + }, + { + "epoch": 3.3457019171304885, + "grad_norm": 0.5, + "learning_rate": 0.00019089673612089243, + "loss": 0.9239, + "step": 2705 + }, + { + "epoch": 3.3518862090290664, + "grad_norm": 0.546875, + "learning_rate": 0.00019083661131102933, + "loss": 0.9289, + "step": 2710 + }, + { + "epoch": 3.358070500927644, + "grad_norm": 0.49609375, + "learning_rate": 0.00019077629813177036, + "loss": 0.925, + "step": 2715 + }, + { + "epoch": 3.3642547928262214, + "grad_norm": 0.43359375, + "learning_rate": 0.00019071579670818808, + "loss": 0.9251, + "step": 2720 + }, + { + "epoch": 3.370439084724799, + "grad_norm": 0.52734375, + "learning_rate": 0.00019065510716574516, + "loss": 0.9272, + "step": 2725 + }, + { + "epoch": 3.3766233766233764, + "grad_norm": 0.4375, + "learning_rate": 0.00019059422963029464, + "loss": 0.9264, + "step": 2730 + }, + { + "epoch": 3.3828076685219544, + "grad_norm": 0.53515625, + "learning_rate": 0.00019053316422807922, + "loss": 0.9169, + "step": 2735 + }, + { + "epoch": 3.388991960420532, + "grad_norm": 0.4609375, + "learning_rate": 0.00019047191108573125, + "loss": 0.9299, + "step": 2740 + }, + { + "epoch": 3.3951762523191094, + "grad_norm": 0.439453125, + "learning_rate": 0.00019041047033027236, + "loss": 0.9293, + "step": 2745 + }, + { + "epoch": 3.4013605442176873, + "grad_norm": 0.4921875, + "learning_rate": 0.00019034884208911335, + "loss": 0.9163, + "step": 2750 + }, + { + "epoch": 3.407544836116265, + "grad_norm": 0.466796875, + "learning_rate": 0.00019028702649005364, + "loss": 0.927, + "step": 2755 + }, + { + "epoch": 3.4137291280148423, + "grad_norm": 0.45703125, + "learning_rate": 0.00019022502366128135, + "loss": 0.9231, + "step": 2760 + }, + { + "epoch": 3.41991341991342, + "grad_norm": 0.427734375, + "learning_rate": 0.00019016283373137274, + "loss": 0.9216, + "step": 2765 + }, + { + "epoch": 3.4260977118119973, + "grad_norm": 0.56640625, + "learning_rate": 0.00019010045682929213, + "loss": 0.9191, + "step": 2770 + }, + { + "epoch": 3.4322820037105752, + "grad_norm": 0.53125, + "learning_rate": 0.00019003789308439148, + "loss": 0.9167, + "step": 2775 + }, + { + "epoch": 3.4384662956091527, + "grad_norm": 0.46875, + "learning_rate": 0.00018997514262641035, + "loss": 0.9203, + "step": 2780 + }, + { + "epoch": 3.4446505875077302, + "grad_norm": 0.48046875, + "learning_rate": 0.00018991220558547533, + "loss": 0.9316, + "step": 2785 + }, + { + "epoch": 3.450834879406308, + "grad_norm": 0.51953125, + "learning_rate": 0.0001898490820921001, + "loss": 0.9255, + "step": 2790 + }, + { + "epoch": 3.4570191713048857, + "grad_norm": 0.474609375, + "learning_rate": 0.00018978577227718484, + "loss": 0.9166, + "step": 2795 + }, + { + "epoch": 3.463203463203463, + "grad_norm": 0.40234375, + "learning_rate": 0.00018972227627201617, + "loss": 0.928, + "step": 2800 + }, + { + "epoch": 3.4693877551020407, + "grad_norm": 0.45703125, + "learning_rate": 0.00018965859420826684, + "loss": 0.9247, + "step": 2805 + }, + { + "epoch": 3.4755720470006186, + "grad_norm": 0.40625, + "learning_rate": 0.0001895947262179954, + "loss": 0.9221, + "step": 2810 + }, + { + "epoch": 3.481756338899196, + "grad_norm": 0.45703125, + "learning_rate": 0.00018953067243364594, + "loss": 0.9183, + "step": 2815 + }, + { + "epoch": 3.4879406307977736, + "grad_norm": 0.46484375, + "learning_rate": 0.00018946643298804793, + "loss": 0.925, + "step": 2820 + }, + { + "epoch": 3.494124922696351, + "grad_norm": 0.46484375, + "learning_rate": 0.0001894020080144157, + "loss": 0.9341, + "step": 2825 + }, + { + "epoch": 3.500309214594929, + "grad_norm": 0.59765625, + "learning_rate": 0.00018933739764634847, + "loss": 0.9188, + "step": 2830 + }, + { + "epoch": 3.5064935064935066, + "grad_norm": 0.62890625, + "learning_rate": 0.00018927260201782978, + "loss": 0.9297, + "step": 2835 + }, + { + "epoch": 3.512677798392084, + "grad_norm": 0.6171875, + "learning_rate": 0.0001892076212632274, + "loss": 0.9197, + "step": 2840 + }, + { + "epoch": 3.5188620902906615, + "grad_norm": 0.56640625, + "learning_rate": 0.00018914245551729305, + "loss": 0.9139, + "step": 2845 + }, + { + "epoch": 3.5250463821892395, + "grad_norm": 0.61328125, + "learning_rate": 0.00018907710491516199, + "loss": 0.9252, + "step": 2850 + }, + { + "epoch": 3.531230674087817, + "grad_norm": 0.482421875, + "learning_rate": 0.00018901156959235285, + "loss": 0.926, + "step": 2855 + }, + { + "epoch": 3.5374149659863945, + "grad_norm": 0.478515625, + "learning_rate": 0.00018894584968476733, + "loss": 0.926, + "step": 2860 + }, + { + "epoch": 3.5435992578849724, + "grad_norm": 0.5625, + "learning_rate": 0.0001888799453286899, + "loss": 0.9311, + "step": 2865 + }, + { + "epoch": 3.54978354978355, + "grad_norm": 0.50390625, + "learning_rate": 0.00018881385666078755, + "loss": 0.9264, + "step": 2870 + }, + { + "epoch": 3.5559678416821274, + "grad_norm": 0.490234375, + "learning_rate": 0.00018874758381810943, + "loss": 0.9353, + "step": 2875 + }, + { + "epoch": 3.562152133580705, + "grad_norm": 0.490234375, + "learning_rate": 0.00018868112693808665, + "loss": 0.931, + "step": 2880 + }, + { + "epoch": 3.5683364254792824, + "grad_norm": 0.4921875, + "learning_rate": 0.0001886144861585319, + "loss": 0.9156, + "step": 2885 + }, + { + "epoch": 3.5745207173778604, + "grad_norm": 0.455078125, + "learning_rate": 0.00018854766161763932, + "loss": 0.933, + "step": 2890 + }, + { + "epoch": 3.580705009276438, + "grad_norm": 0.60546875, + "learning_rate": 0.0001884806534539841, + "loss": 0.9268, + "step": 2895 + }, + { + "epoch": 3.5868893011750154, + "grad_norm": 0.60546875, + "learning_rate": 0.00018841346180652213, + "loss": 0.92, + "step": 2900 + }, + { + "epoch": 3.5930735930735933, + "grad_norm": 0.49609375, + "learning_rate": 0.00018834608681458988, + "loss": 0.9125, + "step": 2905 + }, + { + "epoch": 3.599257884972171, + "grad_norm": 0.5390625, + "learning_rate": 0.00018827852861790398, + "loss": 0.9187, + "step": 2910 + }, + { + "epoch": 3.6054421768707483, + "grad_norm": 0.50390625, + "learning_rate": 0.00018821078735656101, + "loss": 0.9253, + "step": 2915 + }, + { + "epoch": 3.611626468769326, + "grad_norm": 0.455078125, + "learning_rate": 0.00018814286317103714, + "loss": 0.9273, + "step": 2920 + }, + { + "epoch": 3.6178107606679033, + "grad_norm": 0.46484375, + "learning_rate": 0.00018807475620218788, + "loss": 0.9167, + "step": 2925 + }, + { + "epoch": 3.6239950525664812, + "grad_norm": 0.4609375, + "learning_rate": 0.00018800646659124782, + "loss": 0.9192, + "step": 2930 + }, + { + "epoch": 3.6301793444650587, + "grad_norm": 0.419921875, + "learning_rate": 0.00018793799447983025, + "loss": 0.9288, + "step": 2935 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.4375, + "learning_rate": 0.00018786934000992688, + "loss": 0.9198, + "step": 2940 + }, + { + "epoch": 3.642547928262214, + "grad_norm": 0.6875, + "learning_rate": 0.00018780050332390768, + "loss": 0.9157, + "step": 2945 + }, + { + "epoch": 3.6487322201607917, + "grad_norm": 0.47265625, + "learning_rate": 0.00018773148456452046, + "loss": 0.9139, + "step": 2950 + }, + { + "epoch": 3.654916512059369, + "grad_norm": 0.50390625, + "learning_rate": 0.00018766228387489048, + "loss": 0.9218, + "step": 2955 + }, + { + "epoch": 3.6611008039579467, + "grad_norm": 0.48828125, + "learning_rate": 0.00018759290139852048, + "loss": 0.9315, + "step": 2960 + }, + { + "epoch": 3.667285095856524, + "grad_norm": 0.53515625, + "learning_rate": 0.00018752333727928993, + "loss": 0.9291, + "step": 2965 + }, + { + "epoch": 3.673469387755102, + "grad_norm": 0.443359375, + "learning_rate": 0.00018745359166145523, + "loss": 0.915, + "step": 2970 + }, + { + "epoch": 3.6796536796536796, + "grad_norm": 0.47265625, + "learning_rate": 0.00018738366468964898, + "loss": 0.9188, + "step": 2975 + }, + { + "epoch": 3.685837971552257, + "grad_norm": 0.5234375, + "learning_rate": 0.00018731355650887985, + "loss": 0.917, + "step": 2980 + }, + { + "epoch": 3.692022263450835, + "grad_norm": 0.490234375, + "learning_rate": 0.00018724326726453244, + "loss": 0.9351, + "step": 2985 + }, + { + "epoch": 3.6982065553494126, + "grad_norm": 0.5078125, + "learning_rate": 0.00018717279710236666, + "loss": 0.9178, + "step": 2990 + }, + { + "epoch": 3.70439084724799, + "grad_norm": 0.4765625, + "learning_rate": 0.0001871021461685177, + "loss": 0.9245, + "step": 2995 + }, + { + "epoch": 3.7105751391465676, + "grad_norm": 0.41796875, + "learning_rate": 0.00018703131460949554, + "loss": 0.9176, + "step": 3000 + }, + { + "epoch": 3.716759431045145, + "grad_norm": 0.5859375, + "learning_rate": 0.0001869603025721848, + "loss": 0.9334, + "step": 3005 + }, + { + "epoch": 3.722943722943723, + "grad_norm": 0.482421875, + "learning_rate": 0.00018688911020384432, + "loss": 0.9213, + "step": 3010 + }, + { + "epoch": 3.7291280148423005, + "grad_norm": 0.431640625, + "learning_rate": 0.0001868177376521069, + "loss": 0.9271, + "step": 3015 + }, + { + "epoch": 3.7353123067408784, + "grad_norm": 0.48046875, + "learning_rate": 0.000186746185064979, + "loss": 0.9239, + "step": 3020 + }, + { + "epoch": 3.741496598639456, + "grad_norm": 0.4453125, + "learning_rate": 0.00018667445259084036, + "loss": 0.9201, + "step": 3025 + }, + { + "epoch": 3.7476808905380334, + "grad_norm": 0.412109375, + "learning_rate": 0.00018660254037844388, + "loss": 0.9236, + "step": 3030 + }, + { + "epoch": 3.753865182436611, + "grad_norm": 0.40234375, + "learning_rate": 0.00018653044857691508, + "loss": 0.9212, + "step": 3035 + }, + { + "epoch": 3.7600494743351884, + "grad_norm": 0.466796875, + "learning_rate": 0.00018645817733575193, + "loss": 0.9157, + "step": 3040 + }, + { + "epoch": 3.7662337662337664, + "grad_norm": 0.5078125, + "learning_rate": 0.00018638572680482448, + "loss": 0.9181, + "step": 3045 + }, + { + "epoch": 3.772418058132344, + "grad_norm": 0.5546875, + "learning_rate": 0.00018631309713437467, + "loss": 0.9329, + "step": 3050 + }, + { + "epoch": 3.7786023500309214, + "grad_norm": 0.6171875, + "learning_rate": 0.00018624028847501585, + "loss": 0.9188, + "step": 3055 + }, + { + "epoch": 3.7847866419294993, + "grad_norm": 0.64453125, + "learning_rate": 0.0001861673009777325, + "loss": 0.9255, + "step": 3060 + }, + { + "epoch": 3.790970933828077, + "grad_norm": 0.83203125, + "learning_rate": 0.00018609413479388003, + "loss": 0.921, + "step": 3065 + }, + { + "epoch": 3.7971552257266543, + "grad_norm": 0.61328125, + "learning_rate": 0.00018602079007518438, + "loss": 0.9226, + "step": 3070 + }, + { + "epoch": 3.803339517625232, + "grad_norm": 0.5625, + "learning_rate": 0.00018594726697374175, + "loss": 0.9174, + "step": 3075 + }, + { + "epoch": 3.8095238095238093, + "grad_norm": 0.55859375, + "learning_rate": 0.00018587356564201817, + "loss": 0.9104, + "step": 3080 + }, + { + "epoch": 3.8157081014223873, + "grad_norm": 0.4609375, + "learning_rate": 0.00018579968623284933, + "loss": 0.9279, + "step": 3085 + }, + { + "epoch": 3.8218923933209648, + "grad_norm": 0.5390625, + "learning_rate": 0.0001857256288994402, + "loss": 0.9153, + "step": 3090 + }, + { + "epoch": 3.8280766852195423, + "grad_norm": 0.478515625, + "learning_rate": 0.00018565139379536473, + "loss": 0.9225, + "step": 3095 + }, + { + "epoch": 3.83426097711812, + "grad_norm": 0.423828125, + "learning_rate": 0.00018557698107456549, + "loss": 0.9158, + "step": 3100 + }, + { + "epoch": 3.8404452690166977, + "grad_norm": 0.4296875, + "learning_rate": 0.00018550239089135334, + "loss": 0.924, + "step": 3105 + }, + { + "epoch": 3.846629560915275, + "grad_norm": 0.40625, + "learning_rate": 0.00018542762340040722, + "loss": 0.92, + "step": 3110 + }, + { + "epoch": 3.8528138528138527, + "grad_norm": 0.4765625, + "learning_rate": 0.00018535267875677372, + "loss": 0.9123, + "step": 3115 + }, + { + "epoch": 3.85899814471243, + "grad_norm": 0.462890625, + "learning_rate": 0.00018527755711586678, + "loss": 0.9152, + "step": 3120 + }, + { + "epoch": 3.865182436611008, + "grad_norm": 0.54296875, + "learning_rate": 0.00018520225863346743, + "loss": 0.9285, + "step": 3125 + }, + { + "epoch": 3.8713667285095856, + "grad_norm": 0.58984375, + "learning_rate": 0.00018512678346572337, + "loss": 0.9203, + "step": 3130 + }, + { + "epoch": 3.877551020408163, + "grad_norm": 0.73828125, + "learning_rate": 0.0001850511317691487, + "loss": 0.9214, + "step": 3135 + }, + { + "epoch": 3.883735312306741, + "grad_norm": 0.66796875, + "learning_rate": 0.00018497530370062363, + "loss": 0.9185, + "step": 3140 + }, + { + "epoch": 3.8899196042053186, + "grad_norm": 0.455078125, + "learning_rate": 0.00018489929941739407, + "loss": 0.917, + "step": 3145 + }, + { + "epoch": 3.896103896103896, + "grad_norm": 0.4375, + "learning_rate": 0.0001848231190770714, + "loss": 0.9221, + "step": 3150 + }, + { + "epoch": 3.9022881880024736, + "grad_norm": 0.45703125, + "learning_rate": 0.00018474676283763205, + "loss": 0.9148, + "step": 3155 + }, + { + "epoch": 3.908472479901051, + "grad_norm": 0.462890625, + "learning_rate": 0.00018467023085741717, + "loss": 0.9258, + "step": 3160 + }, + { + "epoch": 3.914656771799629, + "grad_norm": 0.404296875, + "learning_rate": 0.0001845935232951325, + "loss": 0.9044, + "step": 3165 + }, + { + "epoch": 3.9208410636982065, + "grad_norm": 0.4375, + "learning_rate": 0.00018451664030984773, + "loss": 0.9217, + "step": 3170 + }, + { + "epoch": 3.927025355596784, + "grad_norm": 0.462890625, + "learning_rate": 0.0001844395820609964, + "loss": 0.9186, + "step": 3175 + }, + { + "epoch": 3.933209647495362, + "grad_norm": 0.455078125, + "learning_rate": 0.00018436234870837547, + "loss": 0.9087, + "step": 3180 + }, + { + "epoch": 3.9393939393939394, + "grad_norm": 0.43359375, + "learning_rate": 0.00018428494041214507, + "loss": 0.9143, + "step": 3185 + }, + { + "epoch": 3.945578231292517, + "grad_norm": 0.48828125, + "learning_rate": 0.00018420735733282807, + "loss": 0.923, + "step": 3190 + }, + { + "epoch": 3.9517625231910944, + "grad_norm": 0.4296875, + "learning_rate": 0.00018412959963130975, + "loss": 0.9181, + "step": 3195 + }, + { + "epoch": 3.9579468150896724, + "grad_norm": 0.423828125, + "learning_rate": 0.00018405166746883762, + "loss": 0.9207, + "step": 3200 + }, + { + "epoch": 3.96413110698825, + "grad_norm": 0.47265625, + "learning_rate": 0.00018397356100702085, + "loss": 0.9107, + "step": 3205 + }, + { + "epoch": 3.9703153988868274, + "grad_norm": 0.419921875, + "learning_rate": 0.00018389528040783012, + "loss": 0.9207, + "step": 3210 + }, + { + "epoch": 3.9764996907854053, + "grad_norm": 0.45703125, + "learning_rate": 0.00018381682583359723, + "loss": 0.9226, + "step": 3215 + }, + { + "epoch": 3.982683982683983, + "grad_norm": 0.431640625, + "learning_rate": 0.00018373819744701476, + "loss": 0.9065, + "step": 3220 + }, + { + "epoch": 3.9888682745825603, + "grad_norm": 0.46875, + "learning_rate": 0.00018365939541113566, + "loss": 0.9145, + "step": 3225 + }, + { + "epoch": 3.995052566481138, + "grad_norm": 0.447265625, + "learning_rate": 0.00018358041988937305, + "loss": 0.925, + "step": 3230 + }, + { + "epoch": 4.0, + "eval_loss": 2.475017786026001, + "eval_runtime": 0.5414, + "eval_samples_per_second": 18.47, + "eval_steps_per_second": 1.847, + "step": 3234 + }, + { + "epoch": 4.001236858379715, + "grad_norm": 0.4921875, + "learning_rate": 0.00018350127104549977, + "loss": 0.9148, + "step": 3235 + }, + { + "epoch": 4.007421150278293, + "grad_norm": 0.55078125, + "learning_rate": 0.00018342194904364813, + "loss": 0.9175, + "step": 3240 + }, + { + "epoch": 4.01360544217687, + "grad_norm": 0.51953125, + "learning_rate": 0.00018334245404830944, + "loss": 0.8931, + "step": 3245 + }, + { + "epoch": 4.019789734075449, + "grad_norm": 0.44140625, + "learning_rate": 0.00018326278622433386, + "loss": 0.9231, + "step": 3250 + }, + { + "epoch": 4.025974025974026, + "grad_norm": 0.498046875, + "learning_rate": 0.00018318294573692985, + "loss": 0.9013, + "step": 3255 + }, + { + "epoch": 4.032158317872604, + "grad_norm": 0.443359375, + "learning_rate": 0.00018310293275166392, + "loss": 0.9067, + "step": 3260 + }, + { + "epoch": 4.038342609771181, + "grad_norm": 0.5390625, + "learning_rate": 0.00018302274743446043, + "loss": 0.9111, + "step": 3265 + }, + { + "epoch": 4.044526901669759, + "grad_norm": 0.4296875, + "learning_rate": 0.00018294238995160094, + "loss": 0.9146, + "step": 3270 + }, + { + "epoch": 4.050711193568336, + "grad_norm": 0.5, + "learning_rate": 0.0001828618604697241, + "loss": 0.8986, + "step": 3275 + }, + { + "epoch": 4.056895485466914, + "grad_norm": 0.52734375, + "learning_rate": 0.00018278115915582526, + "loss": 0.8951, + "step": 3280 + }, + { + "epoch": 4.063079777365492, + "grad_norm": 0.45703125, + "learning_rate": 0.00018270028617725607, + "loss": 0.9153, + "step": 3285 + }, + { + "epoch": 4.06926406926407, + "grad_norm": 0.4609375, + "learning_rate": 0.0001826192417017242, + "loss": 0.9018, + "step": 3290 + }, + { + "epoch": 4.075448361162647, + "grad_norm": 0.421875, + "learning_rate": 0.0001825380258972929, + "loss": 0.9111, + "step": 3295 + }, + { + "epoch": 4.081632653061225, + "grad_norm": 0.44921875, + "learning_rate": 0.00018245663893238075, + "loss": 0.9133, + "step": 3300 + }, + { + "epoch": 4.087816944959802, + "grad_norm": 0.462890625, + "learning_rate": 0.00018237508097576123, + "loss": 0.9063, + "step": 3305 + }, + { + "epoch": 4.09400123685838, + "grad_norm": 0.447265625, + "learning_rate": 0.0001822933521965625, + "loss": 0.8988, + "step": 3310 + }, + { + "epoch": 4.100185528756957, + "grad_norm": 0.47265625, + "learning_rate": 0.00018221145276426683, + "loss": 0.9129, + "step": 3315 + }, + { + "epoch": 4.106369820655535, + "grad_norm": 0.412109375, + "learning_rate": 0.00018212938284871047, + "loss": 0.9105, + "step": 3320 + }, + { + "epoch": 4.112554112554113, + "grad_norm": 0.44921875, + "learning_rate": 0.00018204714262008316, + "loss": 0.8963, + "step": 3325 + }, + { + "epoch": 4.1187384044526905, + "grad_norm": 0.46484375, + "learning_rate": 0.00018196473224892784, + "loss": 0.9094, + "step": 3330 + }, + { + "epoch": 4.124922696351268, + "grad_norm": 0.44921875, + "learning_rate": 0.00018188215190614027, + "loss": 0.8976, + "step": 3335 + }, + { + "epoch": 4.1311069882498455, + "grad_norm": 0.50390625, + "learning_rate": 0.0001817994017629687, + "loss": 0.9048, + "step": 3340 + }, + { + "epoch": 4.137291280148423, + "grad_norm": 0.53515625, + "learning_rate": 0.00018171648199101346, + "loss": 0.9181, + "step": 3345 + }, + { + "epoch": 4.1434755720470005, + "grad_norm": 0.515625, + "learning_rate": 0.00018163339276222666, + "loss": 0.9077, + "step": 3350 + }, + { + "epoch": 4.149659863945578, + "grad_norm": 0.59765625, + "learning_rate": 0.00018155013424891184, + "loss": 0.9036, + "step": 3355 + }, + { + "epoch": 4.1558441558441555, + "grad_norm": 0.54296875, + "learning_rate": 0.00018146670662372354, + "loss": 0.9122, + "step": 3360 + }, + { + "epoch": 4.162028447742734, + "grad_norm": 0.5546875, + "learning_rate": 0.00018138311005966705, + "loss": 0.9029, + "step": 3365 + }, + { + "epoch": 4.168212739641311, + "grad_norm": 0.6953125, + "learning_rate": 0.0001812993447300979, + "loss": 0.9063, + "step": 3370 + }, + { + "epoch": 4.174397031539889, + "grad_norm": 0.62109375, + "learning_rate": 0.00018121541080872176, + "loss": 0.9096, + "step": 3375 + }, + { + "epoch": 4.180581323438466, + "grad_norm": 0.52734375, + "learning_rate": 0.00018113130846959368, + "loss": 0.9121, + "step": 3380 + }, + { + "epoch": 4.186765615337044, + "grad_norm": 0.470703125, + "learning_rate": 0.00018104703788711816, + "loss": 0.9073, + "step": 3385 + }, + { + "epoch": 4.192949907235621, + "grad_norm": 0.408203125, + "learning_rate": 0.0001809625992360485, + "loss": 0.9113, + "step": 3390 + }, + { + "epoch": 4.199134199134199, + "grad_norm": 0.478515625, + "learning_rate": 0.00018087799269148654, + "loss": 0.8939, + "step": 3395 + }, + { + "epoch": 4.205318491032776, + "grad_norm": 0.466796875, + "learning_rate": 0.00018079321842888227, + "loss": 0.8998, + "step": 3400 + }, + { + "epoch": 4.211502782931355, + "grad_norm": 0.484375, + "learning_rate": 0.00018070827662403349, + "loss": 0.897, + "step": 3405 + }, + { + "epoch": 4.217687074829932, + "grad_norm": 0.55859375, + "learning_rate": 0.00018062316745308542, + "loss": 0.9122, + "step": 3410 + }, + { + "epoch": 4.22387136672851, + "grad_norm": 0.62890625, + "learning_rate": 0.00018053789109253042, + "loss": 0.9, + "step": 3415 + }, + { + "epoch": 4.230055658627087, + "grad_norm": 0.478515625, + "learning_rate": 0.0001804524477192075, + "loss": 0.9108, + "step": 3420 + }, + { + "epoch": 4.236239950525665, + "grad_norm": 0.5234375, + "learning_rate": 0.00018036683751030194, + "loss": 0.9214, + "step": 3425 + }, + { + "epoch": 4.242424242424242, + "grad_norm": 0.51171875, + "learning_rate": 0.0001802810606433451, + "loss": 0.8972, + "step": 3430 + }, + { + "epoch": 4.24860853432282, + "grad_norm": 0.4375, + "learning_rate": 0.0001801951172962139, + "loss": 0.9142, + "step": 3435 + }, + { + "epoch": 4.254792826221397, + "grad_norm": 0.451171875, + "learning_rate": 0.00018010900764713048, + "loss": 0.8953, + "step": 3440 + }, + { + "epoch": 4.260977118119976, + "grad_norm": 0.453125, + "learning_rate": 0.0001800227318746619, + "loss": 0.9033, + "step": 3445 + }, + { + "epoch": 4.267161410018553, + "grad_norm": 0.453125, + "learning_rate": 0.0001799362901577196, + "loss": 0.896, + "step": 3450 + }, + { + "epoch": 4.273345701917131, + "grad_norm": 0.5390625, + "learning_rate": 0.00017984968267555925, + "loss": 0.8991, + "step": 3455 + }, + { + "epoch": 4.279529993815708, + "grad_norm": 0.421875, + "learning_rate": 0.00017976290960778024, + "loss": 0.9085, + "step": 3460 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.443359375, + "learning_rate": 0.0001796759711343253, + "loss": 0.9209, + "step": 3465 + }, + { + "epoch": 4.291898577612863, + "grad_norm": 0.44140625, + "learning_rate": 0.0001795888674354802, + "loss": 0.9091, + "step": 3470 + }, + { + "epoch": 4.298082869511441, + "grad_norm": 0.55078125, + "learning_rate": 0.00017950159869187333, + "loss": 0.9103, + "step": 3475 + }, + { + "epoch": 4.304267161410019, + "grad_norm": 0.5234375, + "learning_rate": 0.00017941416508447536, + "loss": 0.9115, + "step": 3480 + }, + { + "epoch": 4.3104514533085965, + "grad_norm": 0.5078125, + "learning_rate": 0.0001793265667945988, + "loss": 0.9025, + "step": 3485 + }, + { + "epoch": 4.316635745207174, + "grad_norm": 0.48046875, + "learning_rate": 0.0001792388040038977, + "loss": 0.9113, + "step": 3490 + }, + { + "epoch": 4.3228200371057515, + "grad_norm": 0.44140625, + "learning_rate": 0.0001791508768943672, + "loss": 0.9182, + "step": 3495 + }, + { + "epoch": 4.329004329004329, + "grad_norm": 0.515625, + "learning_rate": 0.00017906278564834324, + "loss": 0.9105, + "step": 3500 + }, + { + "epoch": 4.3351886209029065, + "grad_norm": 0.478515625, + "learning_rate": 0.00017897453044850208, + "loss": 0.9133, + "step": 3505 + }, + { + "epoch": 4.341372912801484, + "grad_norm": 0.478515625, + "learning_rate": 0.00017888611147786002, + "loss": 0.9112, + "step": 3510 + }, + { + "epoch": 4.3475572047000615, + "grad_norm": 0.44140625, + "learning_rate": 0.00017879752891977296, + "loss": 0.9023, + "step": 3515 + }, + { + "epoch": 4.35374149659864, + "grad_norm": 0.451171875, + "learning_rate": 0.00017870878295793598, + "loss": 0.8993, + "step": 3520 + }, + { + "epoch": 4.359925788497217, + "grad_norm": 0.46875, + "learning_rate": 0.00017861987377638312, + "loss": 0.9052, + "step": 3525 + }, + { + "epoch": 4.366110080395795, + "grad_norm": 0.4765625, + "learning_rate": 0.0001785308015594868, + "loss": 0.8988, + "step": 3530 + }, + { + "epoch": 4.372294372294372, + "grad_norm": 0.466796875, + "learning_rate": 0.00017844156649195759, + "loss": 0.9054, + "step": 3535 + }, + { + "epoch": 4.37847866419295, + "grad_norm": 0.458984375, + "learning_rate": 0.00017835216875884368, + "loss": 0.9039, + "step": 3540 + }, + { + "epoch": 4.384662956091527, + "grad_norm": 0.515625, + "learning_rate": 0.00017826260854553072, + "loss": 0.9082, + "step": 3545 + }, + { + "epoch": 4.390847247990105, + "grad_norm": 0.65234375, + "learning_rate": 0.00017817288603774116, + "loss": 0.9025, + "step": 3550 + }, + { + "epoch": 4.397031539888682, + "grad_norm": 0.43359375, + "learning_rate": 0.00017808300142153406, + "loss": 0.9022, + "step": 3555 + }, + { + "epoch": 4.403215831787261, + "grad_norm": 0.40625, + "learning_rate": 0.00017799295488330467, + "loss": 0.8981, + "step": 3560 + }, + { + "epoch": 4.409400123685838, + "grad_norm": 0.453125, + "learning_rate": 0.000177902746609784, + "loss": 0.9073, + "step": 3565 + }, + { + "epoch": 4.415584415584416, + "grad_norm": 0.53125, + "learning_rate": 0.00017781237678803847, + "loss": 0.908, + "step": 3570 + }, + { + "epoch": 4.421768707482993, + "grad_norm": 0.42578125, + "learning_rate": 0.00017772184560546942, + "loss": 0.8984, + "step": 3575 + }, + { + "epoch": 4.427952999381571, + "grad_norm": 0.515625, + "learning_rate": 0.00017763115324981294, + "loss": 0.9122, + "step": 3580 + }, + { + "epoch": 4.434137291280148, + "grad_norm": 0.458984375, + "learning_rate": 0.00017754029990913926, + "loss": 0.9038, + "step": 3585 + }, + { + "epoch": 4.440321583178726, + "grad_norm": 0.51953125, + "learning_rate": 0.00017744928577185243, + "loss": 0.9047, + "step": 3590 + }, + { + "epoch": 4.446505875077303, + "grad_norm": 0.55078125, + "learning_rate": 0.00017735811102669003, + "loss": 0.8955, + "step": 3595 + }, + { + "epoch": 4.452690166975882, + "grad_norm": 0.490234375, + "learning_rate": 0.00017726677586272263, + "loss": 0.8993, + "step": 3600 + }, + { + "epoch": 4.458874458874459, + "grad_norm": 0.42578125, + "learning_rate": 0.0001771752804693535, + "loss": 0.9106, + "step": 3605 + }, + { + "epoch": 4.465058750773037, + "grad_norm": 0.451171875, + "learning_rate": 0.00017708362503631814, + "loss": 0.895, + "step": 3610 + }, + { + "epoch": 4.471243042671614, + "grad_norm": 0.423828125, + "learning_rate": 0.00017699180975368396, + "loss": 0.9059, + "step": 3615 + }, + { + "epoch": 4.477427334570192, + "grad_norm": 0.421875, + "learning_rate": 0.00017689983481184989, + "loss": 0.9031, + "step": 3620 + }, + { + "epoch": 4.483611626468769, + "grad_norm": 0.482421875, + "learning_rate": 0.0001768077004015458, + "loss": 0.9148, + "step": 3625 + }, + { + "epoch": 4.489795918367347, + "grad_norm": 0.47265625, + "learning_rate": 0.00017671540671383243, + "loss": 0.8983, + "step": 3630 + }, + { + "epoch": 4.495980210265925, + "grad_norm": 0.451171875, + "learning_rate": 0.00017662295394010072, + "loss": 0.9008, + "step": 3635 + }, + { + "epoch": 4.5021645021645025, + "grad_norm": 0.443359375, + "learning_rate": 0.00017653034227207152, + "loss": 0.9089, + "step": 3640 + }, + { + "epoch": 4.50834879406308, + "grad_norm": 0.45703125, + "learning_rate": 0.00017643757190179523, + "loss": 0.9176, + "step": 3645 + }, + { + "epoch": 4.5145330859616575, + "grad_norm": 0.46875, + "learning_rate": 0.00017634464302165124, + "loss": 0.9026, + "step": 3650 + }, + { + "epoch": 4.520717377860235, + "grad_norm": 0.40234375, + "learning_rate": 0.00017625155582434777, + "loss": 0.9066, + "step": 3655 + }, + { + "epoch": 4.5269016697588125, + "grad_norm": 0.4296875, + "learning_rate": 0.0001761583105029213, + "loss": 0.9125, + "step": 3660 + }, + { + "epoch": 4.53308596165739, + "grad_norm": 0.482421875, + "learning_rate": 0.00017606490725073615, + "loss": 0.9058, + "step": 3665 + }, + { + "epoch": 4.5392702535559675, + "grad_norm": 0.4296875, + "learning_rate": 0.00017597134626148427, + "loss": 0.8999, + "step": 3670 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.625, + "learning_rate": 0.00017587762772918467, + "loss": 0.9047, + "step": 3675 + }, + { + "epoch": 4.551638837353123, + "grad_norm": 0.52734375, + "learning_rate": 0.0001757837518481829, + "loss": 0.902, + "step": 3680 + }, + { + "epoch": 4.557823129251701, + "grad_norm": 0.48828125, + "learning_rate": 0.00017568971881315104, + "loss": 0.9083, + "step": 3685 + }, + { + "epoch": 4.564007421150278, + "grad_norm": 0.42578125, + "learning_rate": 0.00017559552881908695, + "loss": 0.9084, + "step": 3690 + }, + { + "epoch": 4.570191713048856, + "grad_norm": 0.498046875, + "learning_rate": 0.00017550118206131402, + "loss": 0.905, + "step": 3695 + }, + { + "epoch": 4.576376004947433, + "grad_norm": 0.447265625, + "learning_rate": 0.00017540667873548063, + "loss": 0.8922, + "step": 3700 + }, + { + "epoch": 4.582560296846011, + "grad_norm": 0.41796875, + "learning_rate": 0.00017531201903755994, + "loss": 0.9123, + "step": 3705 + }, + { + "epoch": 4.588744588744589, + "grad_norm": 0.46484375, + "learning_rate": 0.00017521720316384935, + "loss": 0.895, + "step": 3710 + }, + { + "epoch": 4.594928880643167, + "grad_norm": 0.49609375, + "learning_rate": 0.00017512223131097007, + "loss": 0.9068, + "step": 3715 + }, + { + "epoch": 4.601113172541744, + "grad_norm": 0.419921875, + "learning_rate": 0.00017502710367586687, + "loss": 0.8987, + "step": 3720 + }, + { + "epoch": 4.607297464440322, + "grad_norm": 0.431640625, + "learning_rate": 0.0001749318204558075, + "loss": 0.9054, + "step": 3725 + }, + { + "epoch": 4.613481756338899, + "grad_norm": 0.455078125, + "learning_rate": 0.00017483638184838239, + "loss": 0.9075, + "step": 3730 + }, + { + "epoch": 4.619666048237477, + "grad_norm": 0.44140625, + "learning_rate": 0.0001747407880515041, + "loss": 0.9086, + "step": 3735 + }, + { + "epoch": 4.625850340136054, + "grad_norm": 0.44140625, + "learning_rate": 0.0001746450392634071, + "loss": 0.9125, + "step": 3740 + }, + { + "epoch": 4.632034632034632, + "grad_norm": 0.58203125, + "learning_rate": 0.0001745491356826473, + "loss": 0.9123, + "step": 3745 + }, + { + "epoch": 4.638218923933209, + "grad_norm": 0.54296875, + "learning_rate": 0.0001744530775081015, + "loss": 0.9056, + "step": 3750 + }, + { + "epoch": 4.644403215831788, + "grad_norm": 0.59375, + "learning_rate": 0.0001743568649389672, + "loss": 0.9066, + "step": 3755 + }, + { + "epoch": 4.650587507730365, + "grad_norm": 0.63671875, + "learning_rate": 0.00017426049817476197, + "loss": 0.9052, + "step": 3760 + }, + { + "epoch": 4.656771799628943, + "grad_norm": 0.546875, + "learning_rate": 0.00017416397741532315, + "loss": 0.9075, + "step": 3765 + }, + { + "epoch": 4.66295609152752, + "grad_norm": 0.5390625, + "learning_rate": 0.00017406730286080753, + "loss": 0.8945, + "step": 3770 + }, + { + "epoch": 4.669140383426098, + "grad_norm": 0.515625, + "learning_rate": 0.00017397047471169063, + "loss": 0.9086, + "step": 3775 + }, + { + "epoch": 4.675324675324675, + "grad_norm": 0.53515625, + "learning_rate": 0.00017387349316876666, + "loss": 0.9103, + "step": 3780 + }, + { + "epoch": 4.681508967223253, + "grad_norm": 0.484375, + "learning_rate": 0.0001737763584331479, + "loss": 0.9139, + "step": 3785 + }, + { + "epoch": 4.687693259121831, + "grad_norm": 0.439453125, + "learning_rate": 0.00017367907070626424, + "loss": 0.8962, + "step": 3790 + }, + { + "epoch": 4.6938775510204085, + "grad_norm": 0.41015625, + "learning_rate": 0.00017358163018986282, + "loss": 0.9195, + "step": 3795 + }, + { + "epoch": 4.700061842918986, + "grad_norm": 0.490234375, + "learning_rate": 0.00017348403708600772, + "loss": 0.9006, + "step": 3800 + }, + { + "epoch": 4.7062461348175635, + "grad_norm": 0.47265625, + "learning_rate": 0.00017338629159707936, + "loss": 0.9051, + "step": 3805 + }, + { + "epoch": 4.712430426716141, + "grad_norm": 0.458984375, + "learning_rate": 0.0001732883939257742, + "loss": 0.9039, + "step": 3810 + }, + { + "epoch": 4.7186147186147185, + "grad_norm": 0.439453125, + "learning_rate": 0.0001731903442751043, + "loss": 0.8953, + "step": 3815 + }, + { + "epoch": 4.724799010513296, + "grad_norm": 0.4140625, + "learning_rate": 0.00017309214284839678, + "loss": 0.9031, + "step": 3820 + }, + { + "epoch": 4.7309833024118735, + "grad_norm": 0.431640625, + "learning_rate": 0.00017299378984929366, + "loss": 0.8986, + "step": 3825 + }, + { + "epoch": 4.737167594310451, + "grad_norm": 0.5078125, + "learning_rate": 0.00017289528548175114, + "loss": 0.8992, + "step": 3830 + }, + { + "epoch": 4.743351886209029, + "grad_norm": 0.412109375, + "learning_rate": 0.0001727966299500394, + "loss": 0.9061, + "step": 3835 + }, + { + "epoch": 4.749536178107607, + "grad_norm": 0.474609375, + "learning_rate": 0.00017269782345874203, + "loss": 0.8924, + "step": 3840 + }, + { + "epoch": 4.755720470006184, + "grad_norm": 0.447265625, + "learning_rate": 0.00017259886621275573, + "loss": 0.8993, + "step": 3845 + }, + { + "epoch": 4.761904761904762, + "grad_norm": 0.54296875, + "learning_rate": 0.0001724997584172898, + "loss": 0.8885, + "step": 3850 + }, + { + "epoch": 4.768089053803339, + "grad_norm": 0.498046875, + "learning_rate": 0.0001724005002778657, + "loss": 0.906, + "step": 3855 + }, + { + "epoch": 4.774273345701917, + "grad_norm": 0.5078125, + "learning_rate": 0.00017230109200031668, + "loss": 0.9108, + "step": 3860 + }, + { + "epoch": 4.780457637600494, + "grad_norm": 0.46875, + "learning_rate": 0.00017220153379078737, + "loss": 0.9037, + "step": 3865 + }, + { + "epoch": 4.786641929499073, + "grad_norm": 0.48046875, + "learning_rate": 0.00017210182585573327, + "loss": 0.904, + "step": 3870 + }, + { + "epoch": 4.79282622139765, + "grad_norm": 0.48828125, + "learning_rate": 0.00017200196840192042, + "loss": 0.9066, + "step": 3875 + }, + { + "epoch": 4.799010513296228, + "grad_norm": 0.48828125, + "learning_rate": 0.00017190196163642483, + "loss": 0.8982, + "step": 3880 + }, + { + "epoch": 4.805194805194805, + "grad_norm": 0.443359375, + "learning_rate": 0.00017180180576663228, + "loss": 0.9065, + "step": 3885 + }, + { + "epoch": 4.811379097093383, + "grad_norm": 0.451171875, + "learning_rate": 0.0001717015010002376, + "loss": 0.8865, + "step": 3890 + }, + { + "epoch": 4.81756338899196, + "grad_norm": 0.41796875, + "learning_rate": 0.00017160104754524445, + "loss": 0.9034, + "step": 3895 + }, + { + "epoch": 4.823747680890538, + "grad_norm": 0.408203125, + "learning_rate": 0.00017150044560996488, + "loss": 0.8984, + "step": 3900 + }, + { + "epoch": 4.829931972789115, + "grad_norm": 0.470703125, + "learning_rate": 0.00017139969540301878, + "loss": 0.9073, + "step": 3905 + }, + { + "epoch": 4.836116264687694, + "grad_norm": 0.53515625, + "learning_rate": 0.00017129879713333356, + "loss": 0.9078, + "step": 3910 + }, + { + "epoch": 4.842300556586271, + "grad_norm": 0.46484375, + "learning_rate": 0.00017119775101014358, + "loss": 0.9127, + "step": 3915 + }, + { + "epoch": 4.848484848484849, + "grad_norm": 0.4765625, + "learning_rate": 0.00017109655724298995, + "loss": 0.9068, + "step": 3920 + }, + { + "epoch": 4.854669140383426, + "grad_norm": 0.515625, + "learning_rate": 0.00017099521604171982, + "loss": 0.8936, + "step": 3925 + }, + { + "epoch": 4.860853432282004, + "grad_norm": 0.45703125, + "learning_rate": 0.00017089372761648616, + "loss": 0.9042, + "step": 3930 + }, + { + "epoch": 4.867037724180581, + "grad_norm": 0.462890625, + "learning_rate": 0.0001707920921777472, + "loss": 0.9065, + "step": 3935 + }, + { + "epoch": 4.873222016079159, + "grad_norm": 0.48828125, + "learning_rate": 0.00017069030993626603, + "loss": 0.8962, + "step": 3940 + }, + { + "epoch": 4.879406307977737, + "grad_norm": 0.4921875, + "learning_rate": 0.00017058838110311017, + "loss": 0.9011, + "step": 3945 + }, + { + "epoch": 4.8855905998763145, + "grad_norm": 0.44140625, + "learning_rate": 0.00017048630588965117, + "loss": 0.8989, + "step": 3950 + }, + { + "epoch": 4.891774891774892, + "grad_norm": 0.51953125, + "learning_rate": 0.0001703840845075641, + "loss": 0.9101, + "step": 3955 + }, + { + "epoch": 4.8979591836734695, + "grad_norm": 0.5078125, + "learning_rate": 0.00017028171716882714, + "loss": 0.8964, + "step": 3960 + }, + { + "epoch": 4.904143475572047, + "grad_norm": 0.490234375, + "learning_rate": 0.00017017920408572115, + "loss": 0.9077, + "step": 3965 + }, + { + "epoch": 4.9103277674706245, + "grad_norm": 0.58203125, + "learning_rate": 0.00017007654547082922, + "loss": 0.9015, + "step": 3970 + }, + { + "epoch": 4.916512059369202, + "grad_norm": 0.458984375, + "learning_rate": 0.00016997374153703625, + "loss": 0.8991, + "step": 3975 + }, + { + "epoch": 4.9226963512677795, + "grad_norm": 0.416015625, + "learning_rate": 0.00016987079249752843, + "loss": 0.9045, + "step": 3980 + }, + { + "epoch": 4.928880643166357, + "grad_norm": 0.423828125, + "learning_rate": 0.0001697676985657929, + "loss": 0.9004, + "step": 3985 + }, + { + "epoch": 4.935064935064935, + "grad_norm": 0.47265625, + "learning_rate": 0.00016966445995561727, + "loss": 0.8999, + "step": 3990 + }, + { + "epoch": 4.941249226963513, + "grad_norm": 0.44921875, + "learning_rate": 0.00016956107688108923, + "loss": 0.9044, + "step": 3995 + }, + { + "epoch": 4.94743351886209, + "grad_norm": 0.443359375, + "learning_rate": 0.00016945754955659595, + "loss": 0.9037, + "step": 4000 + }, + { + "epoch": 4.953617810760668, + "grad_norm": 0.4921875, + "learning_rate": 0.00016935387819682376, + "loss": 0.9038, + "step": 4005 + }, + { + "epoch": 4.959802102659245, + "grad_norm": 0.435546875, + "learning_rate": 0.00016925006301675763, + "loss": 0.8995, + "step": 4010 + }, + { + "epoch": 4.965986394557823, + "grad_norm": 0.439453125, + "learning_rate": 0.00016914610423168094, + "loss": 0.9127, + "step": 4015 + }, + { + "epoch": 4.9721706864564, + "grad_norm": 0.44140625, + "learning_rate": 0.0001690420020571747, + "loss": 0.8946, + "step": 4020 + }, + { + "epoch": 4.978354978354979, + "grad_norm": 0.5390625, + "learning_rate": 0.00016893775670911732, + "loss": 0.9108, + "step": 4025 + }, + { + "epoch": 4.984539270253556, + "grad_norm": 0.412109375, + "learning_rate": 0.00016883336840368412, + "loss": 0.9083, + "step": 4030 + }, + { + "epoch": 4.990723562152134, + "grad_norm": 0.45703125, + "learning_rate": 0.0001687288373573469, + "loss": 0.9043, + "step": 4035 + }, + { + "epoch": 4.996907854050711, + "grad_norm": 0.427734375, + "learning_rate": 0.0001686241637868734, + "loss": 0.9099, + "step": 4040 + }, + { + "epoch": 4.999381570810142, + "eval_loss": 2.483851194381714, + "eval_runtime": 0.6403, + "eval_samples_per_second": 15.619, + "eval_steps_per_second": 1.562, + "step": 4042 + }, + { + "epoch": 5.003092145949289, + "grad_norm": 0.54296875, + "learning_rate": 0.00016851934790932692, + "loss": 0.903, + "step": 4045 + }, + { + "epoch": 5.009276437847866, + "grad_norm": 0.4296875, + "learning_rate": 0.00016841438994206595, + "loss": 0.8894, + "step": 4050 + }, + { + "epoch": 5.015460729746444, + "grad_norm": 0.515625, + "learning_rate": 0.0001683092901027436, + "loss": 0.9058, + "step": 4055 + }, + { + "epoch": 5.021645021645021, + "grad_norm": 0.46875, + "learning_rate": 0.0001682040486093071, + "loss": 0.8982, + "step": 4060 + }, + { + "epoch": 5.0278293135436, + "grad_norm": 0.486328125, + "learning_rate": 0.0001680986656799975, + "loss": 0.8948, + "step": 4065 + }, + { + "epoch": 5.034013605442177, + "grad_norm": 0.5234375, + "learning_rate": 0.00016799314153334916, + "loss": 0.8902, + "step": 4070 + }, + { + "epoch": 5.040197897340755, + "grad_norm": 0.41796875, + "learning_rate": 0.00016788747638818926, + "loss": 0.8845, + "step": 4075 + }, + { + "epoch": 5.046382189239332, + "grad_norm": 0.45703125, + "learning_rate": 0.00016778167046363734, + "loss": 0.8875, + "step": 4080 + }, + { + "epoch": 5.05256648113791, + "grad_norm": 0.44140625, + "learning_rate": 0.0001676757239791049, + "loss": 0.8976, + "step": 4085 + }, + { + "epoch": 5.058750773036487, + "grad_norm": 0.50390625, + "learning_rate": 0.00016756963715429502, + "loss": 0.887, + "step": 4090 + }, + { + "epoch": 5.064935064935065, + "grad_norm": 0.421875, + "learning_rate": 0.00016746341020920167, + "loss": 0.8852, + "step": 4095 + }, + { + "epoch": 5.071119356833642, + "grad_norm": 0.3984375, + "learning_rate": 0.00016735704336410943, + "loss": 0.8966, + "step": 4100 + }, + { + "epoch": 5.0773036487322205, + "grad_norm": 0.466796875, + "learning_rate": 0.000167250536839593, + "loss": 0.9055, + "step": 4105 + }, + { + "epoch": 5.083487940630798, + "grad_norm": 0.5, + "learning_rate": 0.0001671438908565167, + "loss": 0.8904, + "step": 4110 + }, + { + "epoch": 5.0896722325293755, + "grad_norm": 0.482421875, + "learning_rate": 0.00016703710563603416, + "loss": 0.8829, + "step": 4115 + }, + { + "epoch": 5.095856524427953, + "grad_norm": 0.451171875, + "learning_rate": 0.00016693018139958763, + "loss": 0.8835, + "step": 4120 + }, + { + "epoch": 5.1020408163265305, + "grad_norm": 0.466796875, + "learning_rate": 0.00016682311836890766, + "loss": 0.8927, + "step": 4125 + }, + { + "epoch": 5.108225108225108, + "grad_norm": 0.458984375, + "learning_rate": 0.00016671591676601272, + "loss": 0.8803, + "step": 4130 + }, + { + "epoch": 5.1144094001236855, + "grad_norm": 0.51953125, + "learning_rate": 0.0001666085768132085, + "loss": 0.8923, + "step": 4135 + }, + { + "epoch": 5.120593692022264, + "grad_norm": 0.55078125, + "learning_rate": 0.00016650109873308765, + "loss": 0.8963, + "step": 4140 + }, + { + "epoch": 5.126777983920841, + "grad_norm": 0.458984375, + "learning_rate": 0.00016639348274852925, + "loss": 0.8986, + "step": 4145 + }, + { + "epoch": 5.132962275819419, + "grad_norm": 0.53515625, + "learning_rate": 0.00016628572908269841, + "loss": 0.8841, + "step": 4150 + }, + { + "epoch": 5.139146567717996, + "grad_norm": 0.5390625, + "learning_rate": 0.00016617783795904565, + "loss": 0.8892, + "step": 4155 + }, + { + "epoch": 5.145330859616574, + "grad_norm": 0.578125, + "learning_rate": 0.00016606980960130665, + "loss": 0.892, + "step": 4160 + }, + { + "epoch": 5.151515151515151, + "grad_norm": 0.55078125, + "learning_rate": 0.00016596164423350157, + "loss": 0.8884, + "step": 4165 + }, + { + "epoch": 5.157699443413729, + "grad_norm": 0.5078125, + "learning_rate": 0.00016585334207993476, + "loss": 0.9061, + "step": 4170 + }, + { + "epoch": 5.163883735312306, + "grad_norm": 0.51171875, + "learning_rate": 0.00016574490336519418, + "loss": 0.8863, + "step": 4175 + }, + { + "epoch": 5.170068027210885, + "grad_norm": 0.51953125, + "learning_rate": 0.00016563632831415102, + "loss": 0.8963, + "step": 4180 + }, + { + "epoch": 5.176252319109462, + "grad_norm": 0.48828125, + "learning_rate": 0.00016552761715195918, + "loss": 0.8934, + "step": 4185 + }, + { + "epoch": 5.18243661100804, + "grad_norm": 0.66015625, + "learning_rate": 0.00016541877010405477, + "loss": 0.9016, + "step": 4190 + }, + { + "epoch": 5.188620902906617, + "grad_norm": 0.53515625, + "learning_rate": 0.00016530978739615578, + "loss": 0.8868, + "step": 4195 + }, + { + "epoch": 5.194805194805195, + "grad_norm": 0.498046875, + "learning_rate": 0.00016520066925426144, + "loss": 0.8907, + "step": 4200 + }, + { + "epoch": 5.200989486703772, + "grad_norm": 0.5, + "learning_rate": 0.00016509141590465189, + "loss": 0.8899, + "step": 4205 + }, + { + "epoch": 5.20717377860235, + "grad_norm": 0.4375, + "learning_rate": 0.00016498202757388758, + "loss": 0.89, + "step": 4210 + }, + { + "epoch": 5.213358070500927, + "grad_norm": 0.470703125, + "learning_rate": 0.00016487250448880893, + "loss": 0.8961, + "step": 4215 + }, + { + "epoch": 5.219542362399506, + "grad_norm": 0.466796875, + "learning_rate": 0.0001647628468765358, + "loss": 0.8962, + "step": 4220 + }, + { + "epoch": 5.225726654298083, + "grad_norm": 0.4453125, + "learning_rate": 0.00016465305496446702, + "loss": 0.8974, + "step": 4225 + }, + { + "epoch": 5.231910946196661, + "grad_norm": 0.42578125, + "learning_rate": 0.0001645431289802799, + "loss": 0.8987, + "step": 4230 + }, + { + "epoch": 5.238095238095238, + "grad_norm": 0.458984375, + "learning_rate": 0.00016443306915192975, + "loss": 0.8891, + "step": 4235 + }, + { + "epoch": 5.244279529993816, + "grad_norm": 0.470703125, + "learning_rate": 0.00016432287570764952, + "loss": 0.8979, + "step": 4240 + }, + { + "epoch": 5.250463821892393, + "grad_norm": 0.490234375, + "learning_rate": 0.00016421254887594917, + "loss": 0.8971, + "step": 4245 + }, + { + "epoch": 5.256648113790971, + "grad_norm": 0.48046875, + "learning_rate": 0.0001641020888856153, + "loss": 0.8943, + "step": 4250 + }, + { + "epoch": 5.262832405689548, + "grad_norm": 0.439453125, + "learning_rate": 0.00016399149596571064, + "loss": 0.901, + "step": 4255 + }, + { + "epoch": 5.2690166975881265, + "grad_norm": 0.458984375, + "learning_rate": 0.00016388077034557355, + "loss": 0.874, + "step": 4260 + }, + { + "epoch": 5.275200989486704, + "grad_norm": 0.51953125, + "learning_rate": 0.0001637699122548176, + "loss": 0.894, + "step": 4265 + }, + { + "epoch": 5.2813852813852815, + "grad_norm": 0.41796875, + "learning_rate": 0.0001636589219233311, + "loss": 0.8834, + "step": 4270 + }, + { + "epoch": 5.287569573283859, + "grad_norm": 0.435546875, + "learning_rate": 0.0001635477995812765, + "loss": 0.9, + "step": 4275 + }, + { + "epoch": 5.2937538651824365, + "grad_norm": 0.44140625, + "learning_rate": 0.00016343654545909007, + "loss": 0.897, + "step": 4280 + }, + { + "epoch": 5.299938157081014, + "grad_norm": 0.546875, + "learning_rate": 0.00016332515978748134, + "loss": 0.8738, + "step": 4285 + }, + { + "epoch": 5.3061224489795915, + "grad_norm": 0.48828125, + "learning_rate": 0.00016321364279743266, + "loss": 0.8839, + "step": 4290 + }, + { + "epoch": 5.312306740878169, + "grad_norm": 0.546875, + "learning_rate": 0.00016310199472019865, + "loss": 0.8876, + "step": 4295 + }, + { + "epoch": 5.318491032776747, + "grad_norm": 0.427734375, + "learning_rate": 0.00016299021578730579, + "loss": 0.8807, + "step": 4300 + }, + { + "epoch": 5.324675324675325, + "grad_norm": 0.484375, + "learning_rate": 0.00016287830623055188, + "loss": 0.8866, + "step": 4305 + }, + { + "epoch": 5.330859616573902, + "grad_norm": 0.47265625, + "learning_rate": 0.00016276626628200568, + "loss": 0.8949, + "step": 4310 + }, + { + "epoch": 5.33704390847248, + "grad_norm": 0.486328125, + "learning_rate": 0.00016265409617400632, + "loss": 0.8963, + "step": 4315 + }, + { + "epoch": 5.343228200371057, + "grad_norm": 0.41796875, + "learning_rate": 0.00016254179613916278, + "loss": 0.8974, + "step": 4320 + }, + { + "epoch": 5.349412492269635, + "grad_norm": 0.46875, + "learning_rate": 0.00016242936641035357, + "loss": 0.8935, + "step": 4325 + }, + { + "epoch": 5.355596784168212, + "grad_norm": 0.431640625, + "learning_rate": 0.000162316807220726, + "loss": 0.8888, + "step": 4330 + }, + { + "epoch": 5.361781076066791, + "grad_norm": 0.46875, + "learning_rate": 0.00016220411880369601, + "loss": 0.8833, + "step": 4335 + }, + { + "epoch": 5.367965367965368, + "grad_norm": 0.49609375, + "learning_rate": 0.00016209130139294744, + "loss": 0.8855, + "step": 4340 + }, + { + "epoch": 5.374149659863946, + "grad_norm": 0.443359375, + "learning_rate": 0.00016197835522243162, + "loss": 0.8867, + "step": 4345 + }, + { + "epoch": 5.380333951762523, + "grad_norm": 0.5859375, + "learning_rate": 0.00016186528052636692, + "loss": 0.8878, + "step": 4350 + }, + { + "epoch": 5.386518243661101, + "grad_norm": 0.455078125, + "learning_rate": 0.00016175207753923822, + "loss": 0.8938, + "step": 4355 + }, + { + "epoch": 5.392702535559678, + "grad_norm": 0.38671875, + "learning_rate": 0.00016163874649579647, + "loss": 0.8857, + "step": 4360 + }, + { + "epoch": 5.398886827458256, + "grad_norm": 0.482421875, + "learning_rate": 0.0001615252876310581, + "loss": 0.889, + "step": 4365 + }, + { + "epoch": 5.405071119356833, + "grad_norm": 0.5625, + "learning_rate": 0.00016141170118030463, + "loss": 0.8858, + "step": 4370 + }, + { + "epoch": 5.411255411255412, + "grad_norm": 0.470703125, + "learning_rate": 0.00016129798737908225, + "loss": 0.8901, + "step": 4375 + }, + { + "epoch": 5.417439703153989, + "grad_norm": 0.48046875, + "learning_rate": 0.0001611841464632011, + "loss": 0.8935, + "step": 4380 + }, + { + "epoch": 5.423623995052567, + "grad_norm": 0.462890625, + "learning_rate": 0.00016107017866873505, + "loss": 0.8871, + "step": 4385 + }, + { + "epoch": 5.429808286951144, + "grad_norm": 0.478515625, + "learning_rate": 0.00016095608423202098, + "loss": 0.8979, + "step": 4390 + }, + { + "epoch": 5.435992578849722, + "grad_norm": 0.4921875, + "learning_rate": 0.00016084186338965843, + "loss": 0.8893, + "step": 4395 + }, + { + "epoch": 5.442176870748299, + "grad_norm": 0.5234375, + "learning_rate": 0.00016072751637850904, + "loss": 0.8928, + "step": 4400 + }, + { + "epoch": 5.448361162646877, + "grad_norm": 0.474609375, + "learning_rate": 0.00016061304343569614, + "loss": 0.8916, + "step": 4405 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 0.45703125, + "learning_rate": 0.00016049844479860422, + "loss": 0.8866, + "step": 4410 + }, + { + "epoch": 5.4607297464440325, + "grad_norm": 0.419921875, + "learning_rate": 0.00016038372070487832, + "loss": 0.9007, + "step": 4415 + }, + { + "epoch": 5.46691403834261, + "grad_norm": 0.54296875, + "learning_rate": 0.00016026887139242372, + "loss": 0.8922, + "step": 4420 + }, + { + "epoch": 5.4730983302411875, + "grad_norm": 0.46875, + "learning_rate": 0.00016015389709940538, + "loss": 0.8912, + "step": 4425 + }, + { + "epoch": 5.479282622139765, + "grad_norm": 0.482421875, + "learning_rate": 0.0001600387980642474, + "loss": 0.8891, + "step": 4430 + }, + { + "epoch": 5.4854669140383425, + "grad_norm": 0.51171875, + "learning_rate": 0.00015992357452563255, + "loss": 0.8973, + "step": 4435 + }, + { + "epoch": 5.49165120593692, + "grad_norm": 0.54296875, + "learning_rate": 0.0001598082267225018, + "loss": 0.9042, + "step": 4440 + }, + { + "epoch": 5.4978354978354975, + "grad_norm": 0.58984375, + "learning_rate": 0.00015969275489405383, + "loss": 0.8975, + "step": 4445 + }, + { + "epoch": 5.504019789734075, + "grad_norm": 0.4921875, + "learning_rate": 0.0001595771592797445, + "loss": 0.8788, + "step": 4450 + }, + { + "epoch": 5.510204081632653, + "grad_norm": 0.54296875, + "learning_rate": 0.00015946144011928638, + "loss": 0.8986, + "step": 4455 + }, + { + "epoch": 5.516388373531231, + "grad_norm": 0.53515625, + "learning_rate": 0.0001593455976526482, + "loss": 0.8966, + "step": 4460 + }, + { + "epoch": 5.522572665429808, + "grad_norm": 0.462890625, + "learning_rate": 0.00015922963212005442, + "loss": 0.8858, + "step": 4465 + }, + { + "epoch": 5.528756957328386, + "grad_norm": 0.482421875, + "learning_rate": 0.0001591135437619847, + "loss": 0.8931, + "step": 4470 + }, + { + "epoch": 5.534941249226963, + "grad_norm": 0.52734375, + "learning_rate": 0.0001589973328191734, + "loss": 0.8913, + "step": 4475 + }, + { + "epoch": 5.541125541125541, + "grad_norm": 0.45703125, + "learning_rate": 0.00015888099953260905, + "loss": 0.8913, + "step": 4480 + }, + { + "epoch": 5.547309833024118, + "grad_norm": 0.4375, + "learning_rate": 0.000158764544143534, + "loss": 0.8886, + "step": 4485 + }, + { + "epoch": 5.553494124922697, + "grad_norm": 0.42578125, + "learning_rate": 0.0001586479668934437, + "loss": 0.9026, + "step": 4490 + }, + { + "epoch": 5.559678416821274, + "grad_norm": 0.466796875, + "learning_rate": 0.00015853126802408633, + "loss": 0.8857, + "step": 4495 + }, + { + "epoch": 5.565862708719852, + "grad_norm": 0.44921875, + "learning_rate": 0.0001584144477774623, + "loss": 0.8948, + "step": 4500 + }, + { + "epoch": 5.572047000618429, + "grad_norm": 0.5390625, + "learning_rate": 0.0001582975063958237, + "loss": 0.8877, + "step": 4505 + }, + { + "epoch": 5.578231292517007, + "grad_norm": 0.73828125, + "learning_rate": 0.0001581804441216738, + "loss": 0.8958, + "step": 4510 + }, + { + "epoch": 5.584415584415584, + "grad_norm": 0.482421875, + "learning_rate": 0.00015806326119776663, + "loss": 0.8914, + "step": 4515 + }, + { + "epoch": 5.590599876314162, + "grad_norm": 0.4140625, + "learning_rate": 0.00015794595786710632, + "loss": 0.8937, + "step": 4520 + }, + { + "epoch": 5.596784168212739, + "grad_norm": 0.419921875, + "learning_rate": 0.0001578285343729468, + "loss": 0.8957, + "step": 4525 + }, + { + "epoch": 5.602968460111317, + "grad_norm": 0.71875, + "learning_rate": 0.00015771099095879108, + "loss": 0.8916, + "step": 4530 + }, + { + "epoch": 5.609152752009895, + "grad_norm": 0.5078125, + "learning_rate": 0.00015759332786839092, + "loss": 0.891, + "step": 4535 + }, + { + "epoch": 5.615337043908473, + "grad_norm": 0.48828125, + "learning_rate": 0.00015747554534574626, + "loss": 0.8814, + "step": 4540 + }, + { + "epoch": 5.62152133580705, + "grad_norm": 0.451171875, + "learning_rate": 0.0001573576436351046, + "loss": 0.8883, + "step": 4545 + }, + { + "epoch": 5.627705627705628, + "grad_norm": 0.44921875, + "learning_rate": 0.0001572396229809608, + "loss": 0.8992, + "step": 4550 + }, + { + "epoch": 5.633889919604205, + "grad_norm": 0.4609375, + "learning_rate": 0.00015712148362805617, + "loss": 0.883, + "step": 4555 + }, + { + "epoch": 5.640074211502783, + "grad_norm": 0.4296875, + "learning_rate": 0.00015700322582137827, + "loss": 0.8908, + "step": 4560 + }, + { + "epoch": 5.646258503401361, + "grad_norm": 0.4453125, + "learning_rate": 0.00015688484980616032, + "loss": 0.8953, + "step": 4565 + }, + { + "epoch": 5.6524427952999385, + "grad_norm": 0.4609375, + "learning_rate": 0.0001567663558278806, + "loss": 0.8968, + "step": 4570 + }, + { + "epoch": 5.658627087198516, + "grad_norm": 0.447265625, + "learning_rate": 0.0001566477441322621, + "loss": 0.8788, + "step": 4575 + }, + { + "epoch": 5.6648113790970935, + "grad_norm": 0.470703125, + "learning_rate": 0.0001565290149652718, + "loss": 0.8847, + "step": 4580 + }, + { + "epoch": 5.670995670995671, + "grad_norm": 0.470703125, + "learning_rate": 0.00015641016857312044, + "loss": 0.8916, + "step": 4585 + }, + { + "epoch": 5.6771799628942485, + "grad_norm": 0.486328125, + "learning_rate": 0.00015629120520226165, + "loss": 0.8942, + "step": 4590 + }, + { + "epoch": 5.683364254792826, + "grad_norm": 0.44140625, + "learning_rate": 0.00015617212509939186, + "loss": 0.893, + "step": 4595 + }, + { + "epoch": 5.6895485466914035, + "grad_norm": 0.44921875, + "learning_rate": 0.00015605292851144942, + "loss": 0.8956, + "step": 4600 + }, + { + "epoch": 5.695732838589981, + "grad_norm": 0.546875, + "learning_rate": 0.00015593361568561428, + "loss": 0.8978, + "step": 4605 + }, + { + "epoch": 5.701917130488559, + "grad_norm": 0.421875, + "learning_rate": 0.00015581418686930743, + "loss": 0.8867, + "step": 4610 + }, + { + "epoch": 5.708101422387137, + "grad_norm": 0.490234375, + "learning_rate": 0.0001556946423101905, + "loss": 0.8914, + "step": 4615 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 0.4609375, + "learning_rate": 0.00015557498225616487, + "loss": 0.892, + "step": 4620 + }, + { + "epoch": 5.720470006184292, + "grad_norm": 0.453125, + "learning_rate": 0.00015545520695537169, + "loss": 0.8906, + "step": 4625 + }, + { + "epoch": 5.726654298082869, + "grad_norm": 0.474609375, + "learning_rate": 0.00015533531665619098, + "loss": 0.8893, + "step": 4630 + }, + { + "epoch": 5.732838589981447, + "grad_norm": 0.46484375, + "learning_rate": 0.00015521531160724126, + "loss": 0.8866, + "step": 4635 + }, + { + "epoch": 5.739022881880024, + "grad_norm": 0.462890625, + "learning_rate": 0.00015509519205737896, + "loss": 0.907, + "step": 4640 + }, + { + "epoch": 5.745207173778603, + "grad_norm": 0.4921875, + "learning_rate": 0.00015497495825569807, + "loss": 0.8901, + "step": 4645 + }, + { + "epoch": 5.75139146567718, + "grad_norm": 0.486328125, + "learning_rate": 0.0001548546104515294, + "loss": 0.8843, + "step": 4650 + }, + { + "epoch": 5.757575757575758, + "grad_norm": 0.51953125, + "learning_rate": 0.00015473414889444014, + "loss": 0.8911, + "step": 4655 + }, + { + "epoch": 5.763760049474335, + "grad_norm": 0.625, + "learning_rate": 0.0001546135738342335, + "loss": 0.8918, + "step": 4660 + }, + { + "epoch": 5.769944341372913, + "grad_norm": 0.55859375, + "learning_rate": 0.00015449288552094796, + "loss": 0.8893, + "step": 4665 + }, + { + "epoch": 5.77612863327149, + "grad_norm": 0.51171875, + "learning_rate": 0.0001543720842048569, + "loss": 0.8921, + "step": 4670 + }, + { + "epoch": 5.782312925170068, + "grad_norm": 0.482421875, + "learning_rate": 0.000154251170136468, + "loss": 0.8927, + "step": 4675 + }, + { + "epoch": 5.788497217068645, + "grad_norm": 0.5, + "learning_rate": 0.00015413014356652286, + "loss": 0.8886, + "step": 4680 + }, + { + "epoch": 5.794681508967223, + "grad_norm": 0.482421875, + "learning_rate": 0.0001540090047459962, + "loss": 0.8811, + "step": 4685 + }, + { + "epoch": 5.800865800865801, + "grad_norm": 0.58984375, + "learning_rate": 0.00015388775392609564, + "loss": 0.8938, + "step": 4690 + }, + { + "epoch": 5.807050092764379, + "grad_norm": 0.427734375, + "learning_rate": 0.00015376639135826107, + "loss": 0.8838, + "step": 4695 + }, + { + "epoch": 5.813234384662956, + "grad_norm": 0.419921875, + "learning_rate": 0.000153644917294164, + "loss": 0.8913, + "step": 4700 + }, + { + "epoch": 5.819418676561534, + "grad_norm": 0.458984375, + "learning_rate": 0.0001535233319857073, + "loss": 0.8946, + "step": 4705 + }, + { + "epoch": 5.825602968460111, + "grad_norm": 0.453125, + "learning_rate": 0.0001534016356850244, + "loss": 0.8948, + "step": 4710 + }, + { + "epoch": 5.831787260358689, + "grad_norm": 0.4609375, + "learning_rate": 0.00015327982864447894, + "loss": 0.8854, + "step": 4715 + }, + { + "epoch": 5.837971552257266, + "grad_norm": 0.47265625, + "learning_rate": 0.00015315791111666425, + "loss": 0.8961, + "step": 4720 + }, + { + "epoch": 5.8441558441558445, + "grad_norm": 0.435546875, + "learning_rate": 0.00015303588335440274, + "loss": 0.8981, + "step": 4725 + }, + { + "epoch": 5.850340136054422, + "grad_norm": 0.46875, + "learning_rate": 0.00015291374561074536, + "loss": 0.8957, + "step": 4730 + }, + { + "epoch": 5.8565244279529995, + "grad_norm": 0.4609375, + "learning_rate": 0.00015279149813897126, + "loss": 0.8876, + "step": 4735 + }, + { + "epoch": 5.862708719851577, + "grad_norm": 0.43359375, + "learning_rate": 0.000152669141192587, + "loss": 0.8829, + "step": 4740 + }, + { + "epoch": 5.8688930117501545, + "grad_norm": 0.462890625, + "learning_rate": 0.00015254667502532626, + "loss": 0.8997, + "step": 4745 + }, + { + "epoch": 5.875077303648732, + "grad_norm": 0.431640625, + "learning_rate": 0.00015242409989114916, + "loss": 0.8926, + "step": 4750 + }, + { + "epoch": 5.8812615955473095, + "grad_norm": 0.42578125, + "learning_rate": 0.00015230141604424181, + "loss": 0.8902, + "step": 4755 + }, + { + "epoch": 5.887445887445887, + "grad_norm": 0.453125, + "learning_rate": 0.00015217862373901575, + "loss": 0.8962, + "step": 4760 + }, + { + "epoch": 5.893630179344465, + "grad_norm": 0.546875, + "learning_rate": 0.0001520557232301074, + "loss": 0.8931, + "step": 4765 + }, + { + "epoch": 5.899814471243043, + "grad_norm": 0.50390625, + "learning_rate": 0.0001519327147723776, + "loss": 0.8883, + "step": 4770 + }, + { + "epoch": 5.90599876314162, + "grad_norm": 0.447265625, + "learning_rate": 0.0001518095986209111, + "loss": 0.8953, + "step": 4775 + }, + { + "epoch": 5.912183055040198, + "grad_norm": 0.484375, + "learning_rate": 0.00015168637503101584, + "loss": 0.9026, + "step": 4780 + }, + { + "epoch": 5.918367346938775, + "grad_norm": 0.484375, + "learning_rate": 0.00015156304425822267, + "loss": 0.8927, + "step": 4785 + }, + { + "epoch": 5.924551638837353, + "grad_norm": 0.439453125, + "learning_rate": 0.00015143960655828468, + "loss": 0.8968, + "step": 4790 + }, + { + "epoch": 5.93073593073593, + "grad_norm": 0.44921875, + "learning_rate": 0.00015131606218717666, + "loss": 0.9022, + "step": 4795 + }, + { + "epoch": 5.936920222634509, + "grad_norm": 0.4609375, + "learning_rate": 0.00015119241140109467, + "loss": 0.8945, + "step": 4800 + }, + { + "epoch": 5.943104514533086, + "grad_norm": 0.427734375, + "learning_rate": 0.00015106865445645536, + "loss": 0.8783, + "step": 4805 + }, + { + "epoch": 5.949288806431664, + "grad_norm": 0.44921875, + "learning_rate": 0.0001509447916098956, + "loss": 0.8833, + "step": 4810 + }, + { + "epoch": 5.955473098330241, + "grad_norm": 0.486328125, + "learning_rate": 0.00015082082311827183, + "loss": 0.8861, + "step": 4815 + }, + { + "epoch": 5.961657390228819, + "grad_norm": 0.458984375, + "learning_rate": 0.0001506967492386596, + "loss": 0.8853, + "step": 4820 + }, + { + "epoch": 5.967841682127396, + "grad_norm": 0.453125, + "learning_rate": 0.00015057257022835295, + "loss": 0.8897, + "step": 4825 + }, + { + "epoch": 5.974025974025974, + "grad_norm": 0.42578125, + "learning_rate": 0.000150448286344864, + "loss": 0.8951, + "step": 4830 + }, + { + "epoch": 5.980210265924551, + "grad_norm": 0.4296875, + "learning_rate": 0.00015032389784592226, + "loss": 0.8823, + "step": 4835 + }, + { + "epoch": 5.986394557823129, + "grad_norm": 0.43359375, + "learning_rate": 0.00015019940498947428, + "loss": 0.8795, + "step": 4840 + }, + { + "epoch": 5.992578849721707, + "grad_norm": 0.48828125, + "learning_rate": 0.000150074808033683, + "loss": 0.9034, + "step": 4845 + }, + { + "epoch": 5.998763141620285, + "grad_norm": 0.4453125, + "learning_rate": 0.00014995010723692714, + "loss": 0.8939, + "step": 4850 + }, + { + "epoch": 6.0, + "eval_loss": 2.4935665130615234, + "eval_runtime": 0.5407, + "eval_samples_per_second": 18.496, + "eval_steps_per_second": 1.85, + "step": 4851 + }, + { + "epoch": 6.004947433518862, + "grad_norm": 0.48046875, + "learning_rate": 0.00014982530285780082, + "loss": 0.8858, + "step": 4855 + }, + { + "epoch": 6.01113172541744, + "grad_norm": 0.47265625, + "learning_rate": 0.00014970039515511304, + "loss": 0.8882, + "step": 4860 + }, + { + "epoch": 6.017316017316017, + "grad_norm": 0.435546875, + "learning_rate": 0.0001495753843878869, + "loss": 0.8914, + "step": 4865 + }, + { + "epoch": 6.023500309214595, + "grad_norm": 0.443359375, + "learning_rate": 0.00014945027081535937, + "loss": 0.8648, + "step": 4870 + }, + { + "epoch": 6.029684601113172, + "grad_norm": 0.5234375, + "learning_rate": 0.00014932505469698052, + "loss": 0.87, + "step": 4875 + }, + { + "epoch": 6.035868893011751, + "grad_norm": 0.43359375, + "learning_rate": 0.00014919973629241314, + "loss": 0.8876, + "step": 4880 + }, + { + "epoch": 6.042053184910328, + "grad_norm": 0.48828125, + "learning_rate": 0.00014907431586153201, + "loss": 0.8852, + "step": 4885 + }, + { + "epoch": 6.0482374768089056, + "grad_norm": 0.447265625, + "learning_rate": 0.0001489487936644237, + "loss": 0.8752, + "step": 4890 + }, + { + "epoch": 6.054421768707483, + "grad_norm": 0.458984375, + "learning_rate": 0.00014882316996138556, + "loss": 0.8855, + "step": 4895 + }, + { + "epoch": 6.0606060606060606, + "grad_norm": 0.54296875, + "learning_rate": 0.00014869744501292561, + "loss": 0.8878, + "step": 4900 + }, + { + "epoch": 6.066790352504638, + "grad_norm": 0.56640625, + "learning_rate": 0.00014857161907976183, + "loss": 0.8818, + "step": 4905 + }, + { + "epoch": 6.0729746444032155, + "grad_norm": 0.435546875, + "learning_rate": 0.00014844569242282148, + "loss": 0.8859, + "step": 4910 + }, + { + "epoch": 6.079158936301793, + "grad_norm": 0.5625, + "learning_rate": 0.00014831966530324078, + "loss": 0.876, + "step": 4915 + }, + { + "epoch": 6.085343228200371, + "grad_norm": 0.49609375, + "learning_rate": 0.00014819353798236427, + "loss": 0.8756, + "step": 4920 + }, + { + "epoch": 6.091527520098949, + "grad_norm": 0.50390625, + "learning_rate": 0.00014806731072174428, + "loss": 0.8867, + "step": 4925 + }, + { + "epoch": 6.097711811997526, + "grad_norm": 0.453125, + "learning_rate": 0.0001479409837831404, + "loss": 0.8736, + "step": 4930 + }, + { + "epoch": 6.103896103896104, + "grad_norm": 0.5859375, + "learning_rate": 0.00014781455742851892, + "loss": 0.878, + "step": 4935 + }, + { + "epoch": 6.110080395794681, + "grad_norm": 0.42578125, + "learning_rate": 0.00014768803192005223, + "loss": 0.8853, + "step": 4940 + }, + { + "epoch": 6.116264687693259, + "grad_norm": 0.6015625, + "learning_rate": 0.00014756140752011842, + "loss": 0.8778, + "step": 4945 + }, + { + "epoch": 6.122448979591836, + "grad_norm": 0.5625, + "learning_rate": 0.00014743468449130063, + "loss": 0.891, + "step": 4950 + }, + { + "epoch": 6.128633271490414, + "grad_norm": 0.48828125, + "learning_rate": 0.00014730786309638652, + "loss": 0.8794, + "step": 4955 + }, + { + "epoch": 6.134817563388992, + "grad_norm": 0.462890625, + "learning_rate": 0.00014718094359836772, + "loss": 0.8913, + "step": 4960 + }, + { + "epoch": 6.14100185528757, + "grad_norm": 0.4375, + "learning_rate": 0.0001470539262604393, + "loss": 0.8825, + "step": 4965 + }, + { + "epoch": 6.147186147186147, + "grad_norm": 0.5703125, + "learning_rate": 0.00014692681134599925, + "loss": 0.8865, + "step": 4970 + }, + { + "epoch": 6.153370439084725, + "grad_norm": 0.48046875, + "learning_rate": 0.00014679959911864784, + "loss": 0.8829, + "step": 4975 + }, + { + "epoch": 6.159554730983302, + "grad_norm": 0.4375, + "learning_rate": 0.0001466722898421873, + "loss": 0.8781, + "step": 4980 + }, + { + "epoch": 6.16573902288188, + "grad_norm": 0.419921875, + "learning_rate": 0.00014654488378062087, + "loss": 0.8809, + "step": 4985 + }, + { + "epoch": 6.171923314780457, + "grad_norm": 0.43359375, + "learning_rate": 0.00014641738119815266, + "loss": 0.885, + "step": 4990 + }, + { + "epoch": 6.178107606679035, + "grad_norm": 0.53125, + "learning_rate": 0.00014628978235918695, + "loss": 0.8823, + "step": 4995 + }, + { + "epoch": 6.184291898577613, + "grad_norm": 0.4609375, + "learning_rate": 0.00014616208752832758, + "loss": 0.8882, + "step": 5000 + }, + { + "epoch": 6.190476190476191, + "grad_norm": 0.42578125, + "learning_rate": 0.0001460342969703774, + "loss": 0.885, + "step": 5005 + }, + { + "epoch": 6.196660482374768, + "grad_norm": 0.490234375, + "learning_rate": 0.00014590641095033787, + "loss": 0.8901, + "step": 5010 + }, + { + "epoch": 6.202844774273346, + "grad_norm": 0.44140625, + "learning_rate": 0.0001457784297334083, + "loss": 0.8722, + "step": 5015 + }, + { + "epoch": 6.209029066171923, + "grad_norm": 0.419921875, + "learning_rate": 0.0001456503535849855, + "loss": 0.8808, + "step": 5020 + }, + { + "epoch": 6.215213358070501, + "grad_norm": 0.474609375, + "learning_rate": 0.00014552218277066314, + "loss": 0.8734, + "step": 5025 + }, + { + "epoch": 6.221397649969078, + "grad_norm": 0.4296875, + "learning_rate": 0.0001453939175562312, + "loss": 0.8814, + "step": 5030 + }, + { + "epoch": 6.227581941867657, + "grad_norm": 0.494140625, + "learning_rate": 0.00014526555820767534, + "loss": 0.8775, + "step": 5035 + }, + { + "epoch": 6.233766233766234, + "grad_norm": 0.515625, + "learning_rate": 0.00014513710499117647, + "loss": 0.8756, + "step": 5040 + }, + { + "epoch": 6.239950525664812, + "grad_norm": 0.484375, + "learning_rate": 0.0001450085581731102, + "loss": 0.8814, + "step": 5045 + }, + { + "epoch": 6.246134817563389, + "grad_norm": 0.4375, + "learning_rate": 0.00014487991802004623, + "loss": 0.8778, + "step": 5050 + }, + { + "epoch": 6.252319109461967, + "grad_norm": 0.56640625, + "learning_rate": 0.00014475118479874774, + "loss": 0.8744, + "step": 5055 + }, + { + "epoch": 6.258503401360544, + "grad_norm": 0.4140625, + "learning_rate": 0.00014462235877617098, + "loss": 0.876, + "step": 5060 + }, + { + "epoch": 6.264687693259122, + "grad_norm": 0.447265625, + "learning_rate": 0.00014449344021946468, + "loss": 0.888, + "step": 5065 + }, + { + "epoch": 6.270871985157699, + "grad_norm": 0.4609375, + "learning_rate": 0.0001443644293959693, + "loss": 0.8792, + "step": 5070 + }, + { + "epoch": 6.2770562770562774, + "grad_norm": 0.443359375, + "learning_rate": 0.0001442353265732168, + "loss": 0.8672, + "step": 5075 + }, + { + "epoch": 6.283240568954855, + "grad_norm": 0.466796875, + "learning_rate": 0.00014410613201892985, + "loss": 0.8773, + "step": 5080 + }, + { + "epoch": 6.289424860853432, + "grad_norm": 0.46875, + "learning_rate": 0.0001439768460010213, + "loss": 0.8835, + "step": 5085 + }, + { + "epoch": 6.29560915275201, + "grad_norm": 0.44921875, + "learning_rate": 0.0001438474687875938, + "loss": 0.8845, + "step": 5090 + }, + { + "epoch": 6.301793444650587, + "grad_norm": 0.4609375, + "learning_rate": 0.000143718000646939, + "loss": 0.8753, + "step": 5095 + }, + { + "epoch": 6.307977736549165, + "grad_norm": 0.4609375, + "learning_rate": 0.00014358844184753712, + "loss": 0.8719, + "step": 5100 + }, + { + "epoch": 6.314162028447742, + "grad_norm": 0.447265625, + "learning_rate": 0.00014345879265805644, + "loss": 0.8807, + "step": 5105 + }, + { + "epoch": 6.32034632034632, + "grad_norm": 0.443359375, + "learning_rate": 0.00014332905334735261, + "loss": 0.8832, + "step": 5110 + }, + { + "epoch": 6.326530612244898, + "grad_norm": 0.423828125, + "learning_rate": 0.00014319922418446824, + "loss": 0.8805, + "step": 5115 + }, + { + "epoch": 6.332714904143476, + "grad_norm": 0.474609375, + "learning_rate": 0.00014306930543863219, + "loss": 0.8839, + "step": 5120 + }, + { + "epoch": 6.338899196042053, + "grad_norm": 0.427734375, + "learning_rate": 0.0001429392973792592, + "loss": 0.8863, + "step": 5125 + }, + { + "epoch": 6.345083487940631, + "grad_norm": 0.439453125, + "learning_rate": 0.00014280920027594907, + "loss": 0.8783, + "step": 5130 + }, + { + "epoch": 6.351267779839208, + "grad_norm": 0.443359375, + "learning_rate": 0.0001426790143984864, + "loss": 0.8823, + "step": 5135 + }, + { + "epoch": 6.357452071737786, + "grad_norm": 0.423828125, + "learning_rate": 0.00014254874001683976, + "loss": 0.8857, + "step": 5140 + }, + { + "epoch": 6.363636363636363, + "grad_norm": 0.419921875, + "learning_rate": 0.00014241837740116132, + "loss": 0.8865, + "step": 5145 + }, + { + "epoch": 6.369820655534941, + "grad_norm": 0.5078125, + "learning_rate": 0.00014228792682178623, + "loss": 0.8779, + "step": 5150 + }, + { + "epoch": 6.376004947433519, + "grad_norm": 0.5, + "learning_rate": 0.00014215738854923203, + "loss": 0.8813, + "step": 5155 + }, + { + "epoch": 6.382189239332097, + "grad_norm": 0.45703125, + "learning_rate": 0.00014202676285419812, + "loss": 0.8824, + "step": 5160 + }, + { + "epoch": 6.388373531230674, + "grad_norm": 0.439453125, + "learning_rate": 0.00014189605000756514, + "loss": 0.8897, + "step": 5165 + }, + { + "epoch": 6.394557823129252, + "grad_norm": 0.423828125, + "learning_rate": 0.00014176525028039452, + "loss": 0.8711, + "step": 5170 + }, + { + "epoch": 6.400742115027829, + "grad_norm": 0.455078125, + "learning_rate": 0.00014163436394392786, + "loss": 0.8929, + "step": 5175 + }, + { + "epoch": 6.406926406926407, + "grad_norm": 0.4921875, + "learning_rate": 0.00014150339126958633, + "loss": 0.8739, + "step": 5180 + }, + { + "epoch": 6.413110698824984, + "grad_norm": 0.462890625, + "learning_rate": 0.0001413723325289701, + "loss": 0.876, + "step": 5185 + }, + { + "epoch": 6.419294990723563, + "grad_norm": 0.482421875, + "learning_rate": 0.00014124118799385796, + "loss": 0.8756, + "step": 5190 + }, + { + "epoch": 6.42547928262214, + "grad_norm": 0.431640625, + "learning_rate": 0.00014110995793620648, + "loss": 0.8882, + "step": 5195 + }, + { + "epoch": 6.431663574520718, + "grad_norm": 0.484375, + "learning_rate": 0.00014097864262814955, + "loss": 0.8771, + "step": 5200 + }, + { + "epoch": 6.437847866419295, + "grad_norm": 0.484375, + "learning_rate": 0.000140847242341998, + "loss": 0.8794, + "step": 5205 + }, + { + "epoch": 6.444032158317873, + "grad_norm": 0.50390625, + "learning_rate": 0.00014071575735023875, + "loss": 0.8804, + "step": 5210 + }, + { + "epoch": 6.45021645021645, + "grad_norm": 0.5, + "learning_rate": 0.00014058418792553445, + "loss": 0.874, + "step": 5215 + }, + { + "epoch": 6.456400742115028, + "grad_norm": 0.6171875, + "learning_rate": 0.0001404525343407228, + "loss": 0.8731, + "step": 5220 + }, + { + "epoch": 6.462585034013605, + "grad_norm": 0.462890625, + "learning_rate": 0.00014032079686881603, + "loss": 0.8762, + "step": 5225 + }, + { + "epoch": 6.4687693259121835, + "grad_norm": 0.48046875, + "learning_rate": 0.00014018897578300035, + "loss": 0.8906, + "step": 5230 + }, + { + "epoch": 6.474953617810761, + "grad_norm": 0.46875, + "learning_rate": 0.00014005707135663527, + "loss": 0.8737, + "step": 5235 + }, + { + "epoch": 6.4811379097093385, + "grad_norm": 0.408203125, + "learning_rate": 0.0001399250838632533, + "loss": 0.8806, + "step": 5240 + }, + { + "epoch": 6.487322201607916, + "grad_norm": 0.4140625, + "learning_rate": 0.00013979301357655905, + "loss": 0.8737, + "step": 5245 + }, + { + "epoch": 6.4935064935064934, + "grad_norm": 0.458984375, + "learning_rate": 0.0001396608607704289, + "loss": 0.878, + "step": 5250 + }, + { + "epoch": 6.499690785405071, + "grad_norm": 0.46484375, + "learning_rate": 0.00013952862571891034, + "loss": 0.8754, + "step": 5255 + }, + { + "epoch": 6.5058750773036484, + "grad_norm": 0.46875, + "learning_rate": 0.00013939630869622133, + "loss": 0.8818, + "step": 5260 + }, + { + "epoch": 6.512059369202227, + "grad_norm": 0.470703125, + "learning_rate": 0.00013926390997674997, + "loss": 0.8834, + "step": 5265 + }, + { + "epoch": 6.518243661100804, + "grad_norm": 0.5703125, + "learning_rate": 0.00013913142983505364, + "loss": 0.8787, + "step": 5270 + }, + { + "epoch": 6.524427952999382, + "grad_norm": 0.4375, + "learning_rate": 0.00013899886854585862, + "loss": 0.8813, + "step": 5275 + }, + { + "epoch": 6.530612244897959, + "grad_norm": 0.45703125, + "learning_rate": 0.00013886622638405952, + "loss": 0.8835, + "step": 5280 + }, + { + "epoch": 6.536796536796537, + "grad_norm": 0.416015625, + "learning_rate": 0.00013873350362471855, + "loss": 0.8811, + "step": 5285 + }, + { + "epoch": 6.542980828695114, + "grad_norm": 0.4765625, + "learning_rate": 0.00013860070054306516, + "loss": 0.891, + "step": 5290 + }, + { + "epoch": 6.549165120593692, + "grad_norm": 0.5546875, + "learning_rate": 0.00013846781741449525, + "loss": 0.8807, + "step": 5295 + }, + { + "epoch": 6.555349412492269, + "grad_norm": 0.45703125, + "learning_rate": 0.0001383348545145708, + "loss": 0.883, + "step": 5300 + }, + { + "epoch": 6.561533704390847, + "grad_norm": 0.431640625, + "learning_rate": 0.0001382018121190192, + "loss": 0.8807, + "step": 5305 + }, + { + "epoch": 6.567717996289425, + "grad_norm": 0.66796875, + "learning_rate": 0.0001380686905037327, + "loss": 0.8844, + "step": 5310 + }, + { + "epoch": 6.573902288188003, + "grad_norm": 0.431640625, + "learning_rate": 0.0001379354899447678, + "loss": 0.893, + "step": 5315 + }, + { + "epoch": 6.58008658008658, + "grad_norm": 0.48828125, + "learning_rate": 0.00013780221071834476, + "loss": 0.8825, + "step": 5320 + }, + { + "epoch": 6.586270871985158, + "grad_norm": 0.466796875, + "learning_rate": 0.00013766885310084688, + "loss": 0.8899, + "step": 5325 + }, + { + "epoch": 6.592455163883735, + "grad_norm": 0.43359375, + "learning_rate": 0.0001375354173688201, + "loss": 0.8774, + "step": 5330 + }, + { + "epoch": 6.598639455782313, + "grad_norm": 0.431640625, + "learning_rate": 0.00013740190379897226, + "loss": 0.8802, + "step": 5335 + }, + { + "epoch": 6.60482374768089, + "grad_norm": 0.416015625, + "learning_rate": 0.00013726831266817278, + "loss": 0.877, + "step": 5340 + }, + { + "epoch": 6.611008039579469, + "grad_norm": 0.4765625, + "learning_rate": 0.00013713464425345174, + "loss": 0.8791, + "step": 5345 + }, + { + "epoch": 6.617192331478046, + "grad_norm": 0.546875, + "learning_rate": 0.00013700089883199966, + "loss": 0.8822, + "step": 5350 + }, + { + "epoch": 6.623376623376624, + "grad_norm": 0.421875, + "learning_rate": 0.0001368670766811665, + "loss": 0.8786, + "step": 5355 + }, + { + "epoch": 6.629560915275201, + "grad_norm": 0.44140625, + "learning_rate": 0.0001367331780784616, + "loss": 0.876, + "step": 5360 + }, + { + "epoch": 6.635745207173779, + "grad_norm": 0.458984375, + "learning_rate": 0.0001365992033015527, + "loss": 0.875, + "step": 5365 + }, + { + "epoch": 6.641929499072356, + "grad_norm": 0.423828125, + "learning_rate": 0.00013646515262826552, + "loss": 0.8762, + "step": 5370 + }, + { + "epoch": 6.648113790970934, + "grad_norm": 0.494140625, + "learning_rate": 0.00013633102633658318, + "loss": 0.8862, + "step": 5375 + }, + { + "epoch": 6.654298082869511, + "grad_norm": 0.4609375, + "learning_rate": 0.00013619682470464558, + "loss": 0.889, + "step": 5380 + }, + { + "epoch": 6.660482374768089, + "grad_norm": 0.59375, + "learning_rate": 0.00013606254801074895, + "loss": 0.883, + "step": 5385 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.53515625, + "learning_rate": 0.00013592819653334505, + "loss": 0.8794, + "step": 5390 + }, + { + "epoch": 6.6728509585652445, + "grad_norm": 0.55859375, + "learning_rate": 0.0001357937705510408, + "loss": 0.8812, + "step": 5395 + }, + { + "epoch": 6.679035250463822, + "grad_norm": 0.50390625, + "learning_rate": 0.0001356592703425976, + "loss": 0.8742, + "step": 5400 + }, + { + "epoch": 6.6852195423623995, + "grad_norm": 0.51953125, + "learning_rate": 0.00013552469618693076, + "loss": 0.8759, + "step": 5405 + }, + { + "epoch": 6.691403834260977, + "grad_norm": 0.408203125, + "learning_rate": 0.00013539004836310894, + "loss": 0.8747, + "step": 5410 + }, + { + "epoch": 6.6975881261595545, + "grad_norm": 0.400390625, + "learning_rate": 0.00013525532715035366, + "loss": 0.8803, + "step": 5415 + }, + { + "epoch": 6.703772418058133, + "grad_norm": 0.4140625, + "learning_rate": 0.0001351205328280385, + "loss": 0.8769, + "step": 5420 + }, + { + "epoch": 6.70995670995671, + "grad_norm": 0.45703125, + "learning_rate": 0.00013498566567568865, + "loss": 0.8749, + "step": 5425 + }, + { + "epoch": 6.716141001855288, + "grad_norm": 0.435546875, + "learning_rate": 0.00013485072597298038, + "loss": 0.8708, + "step": 5430 + }, + { + "epoch": 6.722325293753865, + "grad_norm": 0.4375, + "learning_rate": 0.00013471571399974045, + "loss": 0.8736, + "step": 5435 + }, + { + "epoch": 6.728509585652443, + "grad_norm": 0.44140625, + "learning_rate": 0.00013458063003594543, + "loss": 0.8774, + "step": 5440 + }, + { + "epoch": 6.73469387755102, + "grad_norm": 0.58203125, + "learning_rate": 0.00013444547436172117, + "loss": 0.877, + "step": 5445 + }, + { + "epoch": 6.740878169449598, + "grad_norm": 0.515625, + "learning_rate": 0.0001343102472573423, + "loss": 0.8998, + "step": 5450 + }, + { + "epoch": 6.747062461348175, + "grad_norm": 0.470703125, + "learning_rate": 0.00013417494900323142, + "loss": 0.8903, + "step": 5455 + }, + { + "epoch": 6.753246753246753, + "grad_norm": 0.5078125, + "learning_rate": 0.00013403957987995882, + "loss": 0.8768, + "step": 5460 + }, + { + "epoch": 6.759431045145331, + "grad_norm": 0.470703125, + "learning_rate": 0.00013390414016824174, + "loss": 0.8859, + "step": 5465 + }, + { + "epoch": 6.765615337043909, + "grad_norm": 0.43359375, + "learning_rate": 0.00013376863014894375, + "loss": 0.8784, + "step": 5470 + }, + { + "epoch": 6.771799628942486, + "grad_norm": 0.47265625, + "learning_rate": 0.00013363305010307425, + "loss": 0.8793, + "step": 5475 + }, + { + "epoch": 6.777983920841064, + "grad_norm": 0.46875, + "learning_rate": 0.00013349740031178784, + "loss": 0.8688, + "step": 5480 + }, + { + "epoch": 6.784168212739641, + "grad_norm": 0.47265625, + "learning_rate": 0.0001333616810563837, + "loss": 0.8727, + "step": 5485 + }, + { + "epoch": 6.790352504638219, + "grad_norm": 0.439453125, + "learning_rate": 0.00013322589261830517, + "loss": 0.8718, + "step": 5490 + }, + { + "epoch": 6.796536796536796, + "grad_norm": 0.455078125, + "learning_rate": 0.000133090035279139, + "loss": 0.8866, + "step": 5495 + }, + { + "epoch": 6.802721088435375, + "grad_norm": 0.52734375, + "learning_rate": 0.00013295410932061478, + "loss": 0.8834, + "step": 5500 + }, + { + "epoch": 6.808905380333952, + "grad_norm": 0.443359375, + "learning_rate": 0.0001328181150246045, + "loss": 0.8827, + "step": 5505 + }, + { + "epoch": 6.81508967223253, + "grad_norm": 0.5234375, + "learning_rate": 0.00013268205267312174, + "loss": 0.8842, + "step": 5510 + }, + { + "epoch": 6.821273964131107, + "grad_norm": 0.462890625, + "learning_rate": 0.0001325459225483213, + "loss": 0.8813, + "step": 5515 + }, + { + "epoch": 6.827458256029685, + "grad_norm": 0.484375, + "learning_rate": 0.00013240972493249847, + "loss": 0.8872, + "step": 5520 + }, + { + "epoch": 6.833642547928262, + "grad_norm": 0.443359375, + "learning_rate": 0.0001322734601080885, + "loss": 0.8884, + "step": 5525 + }, + { + "epoch": 6.83982683982684, + "grad_norm": 0.458984375, + "learning_rate": 0.00013213712835766607, + "loss": 0.8877, + "step": 5530 + }, + { + "epoch": 6.846011131725417, + "grad_norm": 0.48828125, + "learning_rate": 0.0001320007299639446, + "loss": 0.8777, + "step": 5535 + }, + { + "epoch": 6.852195423623995, + "grad_norm": 0.427734375, + "learning_rate": 0.0001318642652097757, + "loss": 0.8732, + "step": 5540 + }, + { + "epoch": 6.858379715522573, + "grad_norm": 0.41015625, + "learning_rate": 0.00013172773437814865, + "loss": 0.8859, + "step": 5545 + }, + { + "epoch": 6.8645640074211505, + "grad_norm": 0.46875, + "learning_rate": 0.00013159113775218964, + "loss": 0.8738, + "step": 5550 + }, + { + "epoch": 6.870748299319728, + "grad_norm": 0.447265625, + "learning_rate": 0.00013145447561516138, + "loss": 0.8849, + "step": 5555 + }, + { + "epoch": 6.8769325912183055, + "grad_norm": 0.462890625, + "learning_rate": 0.00013131774825046245, + "loss": 0.8766, + "step": 5560 + }, + { + "epoch": 6.883116883116883, + "grad_norm": 0.447265625, + "learning_rate": 0.0001311809559416267, + "loss": 0.881, + "step": 5565 + }, + { + "epoch": 6.8893011750154605, + "grad_norm": 0.51953125, + "learning_rate": 0.00013104409897232258, + "loss": 0.8833, + "step": 5570 + }, + { + "epoch": 6.895485466914038, + "grad_norm": 0.44921875, + "learning_rate": 0.00013090717762635266, + "loss": 0.8787, + "step": 5575 + }, + { + "epoch": 6.901669758812616, + "grad_norm": 0.443359375, + "learning_rate": 0.00013077019218765305, + "loss": 0.8843, + "step": 5580 + }, + { + "epoch": 6.907854050711194, + "grad_norm": 0.4765625, + "learning_rate": 0.0001306331429402927, + "loss": 0.8901, + "step": 5585 + }, + { + "epoch": 6.914038342609771, + "grad_norm": 0.55078125, + "learning_rate": 0.00013049603016847296, + "loss": 0.8918, + "step": 5590 + }, + { + "epoch": 6.920222634508349, + "grad_norm": 0.50390625, + "learning_rate": 0.00013035885415652685, + "loss": 0.8775, + "step": 5595 + }, + { + "epoch": 6.926406926406926, + "grad_norm": 0.57421875, + "learning_rate": 0.00013022161518891855, + "loss": 0.8947, + "step": 5600 + }, + { + "epoch": 6.932591218305504, + "grad_norm": 0.470703125, + "learning_rate": 0.00013008431355024283, + "loss": 0.8815, + "step": 5605 + }, + { + "epoch": 6.938775510204081, + "grad_norm": 0.49609375, + "learning_rate": 0.00012994694952522435, + "loss": 0.8678, + "step": 5610 + }, + { + "epoch": 6.944959802102659, + "grad_norm": 0.478515625, + "learning_rate": 0.00012980952339871718, + "loss": 0.8793, + "step": 5615 + }, + { + "epoch": 6.951144094001237, + "grad_norm": 0.484375, + "learning_rate": 0.00012967203545570418, + "loss": 0.8782, + "step": 5620 + }, + { + "epoch": 6.957328385899815, + "grad_norm": 0.419921875, + "learning_rate": 0.00012953448598129643, + "loss": 0.8754, + "step": 5625 + }, + { + "epoch": 6.963512677798392, + "grad_norm": 0.5078125, + "learning_rate": 0.0001293968752607325, + "loss": 0.8751, + "step": 5630 + }, + { + "epoch": 6.96969696969697, + "grad_norm": 0.47265625, + "learning_rate": 0.00012925920357937808, + "loss": 0.8816, + "step": 5635 + }, + { + "epoch": 6.975881261595547, + "grad_norm": 0.46875, + "learning_rate": 0.00012912147122272523, + "loss": 0.8865, + "step": 5640 + }, + { + "epoch": 6.982065553494125, + "grad_norm": 0.45703125, + "learning_rate": 0.0001289836784763918, + "loss": 0.8764, + "step": 5645 + }, + { + "epoch": 6.988249845392702, + "grad_norm": 0.54296875, + "learning_rate": 0.00012884582562612095, + "loss": 0.8862, + "step": 5650 + }, + { + "epoch": 6.994434137291281, + "grad_norm": 0.609375, + "learning_rate": 0.0001287079129577804, + "loss": 0.8795, + "step": 5655 + }, + { + "epoch": 6.999381570810142, + "eval_loss": 2.5032334327697754, + "eval_runtime": 0.6588, + "eval_samples_per_second": 15.178, + "eval_steps_per_second": 1.518, + "step": 5659 + }, + { + "epoch": 7.000618429189858, + "grad_norm": 0.51953125, + "learning_rate": 0.00012856994075736197, + "loss": 0.8769, + "step": 5660 + }, + { + "epoch": 7.006802721088436, + "grad_norm": 0.482421875, + "learning_rate": 0.00012843190931098093, + "loss": 0.8531, + "step": 5665 + }, + { + "epoch": 7.012987012987013, + "grad_norm": 0.439453125, + "learning_rate": 0.00012829381890487536, + "loss": 0.8766, + "step": 5670 + }, + { + "epoch": 7.019171304885591, + "grad_norm": 0.5078125, + "learning_rate": 0.00012815566982540567, + "loss": 0.8794, + "step": 5675 + }, + { + "epoch": 7.025355596784168, + "grad_norm": 0.455078125, + "learning_rate": 0.00012801746235905384, + "loss": 0.8736, + "step": 5680 + }, + { + "epoch": 7.031539888682746, + "grad_norm": 0.48828125, + "learning_rate": 0.00012787919679242306, + "loss": 0.8789, + "step": 5685 + }, + { + "epoch": 7.037724180581323, + "grad_norm": 0.51953125, + "learning_rate": 0.00012774087341223695, + "loss": 0.8716, + "step": 5690 + }, + { + "epoch": 7.0439084724799015, + "grad_norm": 0.482421875, + "learning_rate": 0.000127602492505339, + "loss": 0.8688, + "step": 5695 + }, + { + "epoch": 7.050092764378479, + "grad_norm": 0.51953125, + "learning_rate": 0.00012746405435869198, + "loss": 0.8736, + "step": 5700 + }, + { + "epoch": 7.0562770562770565, + "grad_norm": 0.455078125, + "learning_rate": 0.0001273255592593774, + "loss": 0.8623, + "step": 5705 + }, + { + "epoch": 7.062461348175634, + "grad_norm": 0.5078125, + "learning_rate": 0.00012718700749459486, + "loss": 0.8647, + "step": 5710 + }, + { + "epoch": 7.0686456400742115, + "grad_norm": 0.59375, + "learning_rate": 0.00012704839935166143, + "loss": 0.8648, + "step": 5715 + }, + { + "epoch": 7.074829931972789, + "grad_norm": 0.45703125, + "learning_rate": 0.0001269097351180112, + "loss": 0.8635, + "step": 5720 + }, + { + "epoch": 7.0810142238713665, + "grad_norm": 0.474609375, + "learning_rate": 0.00012677101508119445, + "loss": 0.8724, + "step": 5725 + }, + { + "epoch": 7.087198515769944, + "grad_norm": 0.54296875, + "learning_rate": 0.00012663223952887723, + "loss": 0.8703, + "step": 5730 + }, + { + "epoch": 7.093382807668522, + "grad_norm": 0.43359375, + "learning_rate": 0.00012649340874884075, + "loss": 0.8688, + "step": 5735 + }, + { + "epoch": 7.0995670995671, + "grad_norm": 0.5234375, + "learning_rate": 0.0001263545230289807, + "loss": 0.8691, + "step": 5740 + }, + { + "epoch": 7.105751391465677, + "grad_norm": 0.515625, + "learning_rate": 0.0001262155826573067, + "loss": 0.8708, + "step": 5745 + }, + { + "epoch": 7.111935683364255, + "grad_norm": 0.4453125, + "learning_rate": 0.00012607658792194174, + "loss": 0.872, + "step": 5750 + }, + { + "epoch": 7.118119975262832, + "grad_norm": 0.427734375, + "learning_rate": 0.0001259375391111215, + "loss": 0.8774, + "step": 5755 + }, + { + "epoch": 7.12430426716141, + "grad_norm": 0.4609375, + "learning_rate": 0.0001257984365131938, + "loss": 0.8759, + "step": 5760 + }, + { + "epoch": 7.130488559059987, + "grad_norm": 0.466796875, + "learning_rate": 0.0001256592804166181, + "loss": 0.8642, + "step": 5765 + }, + { + "epoch": 7.136672850958565, + "grad_norm": 0.44140625, + "learning_rate": 0.00012552007110996463, + "loss": 0.8667, + "step": 5770 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.48828125, + "learning_rate": 0.00012538080888191408, + "loss": 0.8672, + "step": 5775 + }, + { + "epoch": 7.149041434755721, + "grad_norm": 0.4296875, + "learning_rate": 0.00012524149402125685, + "loss": 0.8582, + "step": 5780 + }, + { + "epoch": 7.155225726654298, + "grad_norm": 0.515625, + "learning_rate": 0.0001251021268168925, + "loss": 0.8789, + "step": 5785 + }, + { + "epoch": 7.161410018552876, + "grad_norm": 0.51953125, + "learning_rate": 0.00012496270755782914, + "loss": 0.8787, + "step": 5790 + }, + { + "epoch": 7.167594310451453, + "grad_norm": 0.53125, + "learning_rate": 0.00012482323653318278, + "loss": 0.8723, + "step": 5795 + }, + { + "epoch": 7.173778602350031, + "grad_norm": 0.46484375, + "learning_rate": 0.00012468371403217684, + "loss": 0.8649, + "step": 5800 + }, + { + "epoch": 7.179962894248608, + "grad_norm": 0.64453125, + "learning_rate": 0.00012454414034414142, + "loss": 0.877, + "step": 5805 + }, + { + "epoch": 7.186147186147186, + "grad_norm": 0.466796875, + "learning_rate": 0.00012440451575851285, + "loss": 0.8851, + "step": 5810 + }, + { + "epoch": 7.192331478045764, + "grad_norm": 0.4375, + "learning_rate": 0.00012426484056483292, + "loss": 0.8703, + "step": 5815 + }, + { + "epoch": 7.198515769944342, + "grad_norm": 0.486328125, + "learning_rate": 0.00012412511505274844, + "loss": 0.8664, + "step": 5820 + }, + { + "epoch": 7.204700061842919, + "grad_norm": 0.458984375, + "learning_rate": 0.00012398533951201053, + "loss": 0.8617, + "step": 5825 + }, + { + "epoch": 7.210884353741497, + "grad_norm": 0.427734375, + "learning_rate": 0.00012384551423247407, + "loss": 0.8691, + "step": 5830 + }, + { + "epoch": 7.217068645640074, + "grad_norm": 0.48828125, + "learning_rate": 0.00012370563950409703, + "loss": 0.8808, + "step": 5835 + }, + { + "epoch": 7.223252937538652, + "grad_norm": 0.494140625, + "learning_rate": 0.00012356571561693996, + "loss": 0.8674, + "step": 5840 + }, + { + "epoch": 7.229437229437229, + "grad_norm": 0.435546875, + "learning_rate": 0.00012342574286116544, + "loss": 0.8644, + "step": 5845 + }, + { + "epoch": 7.235621521335807, + "grad_norm": 0.49609375, + "learning_rate": 0.00012328572152703725, + "loss": 0.8745, + "step": 5850 + }, + { + "epoch": 7.241805813234385, + "grad_norm": 0.4765625, + "learning_rate": 0.00012314565190491998, + "loss": 0.8642, + "step": 5855 + }, + { + "epoch": 7.2479901051329625, + "grad_norm": 0.51171875, + "learning_rate": 0.00012300553428527832, + "loss": 0.8785, + "step": 5860 + }, + { + "epoch": 7.25417439703154, + "grad_norm": 0.44140625, + "learning_rate": 0.00012286536895867654, + "loss": 0.8719, + "step": 5865 + }, + { + "epoch": 7.2603586889301175, + "grad_norm": 0.427734375, + "learning_rate": 0.00012272515621577782, + "loss": 0.8651, + "step": 5870 + }, + { + "epoch": 7.266542980828695, + "grad_norm": 0.48046875, + "learning_rate": 0.00012258489634734367, + "loss": 0.87, + "step": 5875 + }, + { + "epoch": 7.2727272727272725, + "grad_norm": 0.462890625, + "learning_rate": 0.00012244458964423327, + "loss": 0.8693, + "step": 5880 + }, + { + "epoch": 7.27891156462585, + "grad_norm": 0.46484375, + "learning_rate": 0.000122304236397403, + "loss": 0.8658, + "step": 5885 + }, + { + "epoch": 7.285095856524428, + "grad_norm": 0.458984375, + "learning_rate": 0.00012216383689790574, + "loss": 0.8749, + "step": 5890 + }, + { + "epoch": 7.291280148423006, + "grad_norm": 0.408203125, + "learning_rate": 0.00012202339143689023, + "loss": 0.8737, + "step": 5895 + }, + { + "epoch": 7.297464440321583, + "grad_norm": 0.484375, + "learning_rate": 0.00012188290030560063, + "loss": 0.871, + "step": 5900 + }, + { + "epoch": 7.303648732220161, + "grad_norm": 0.44921875, + "learning_rate": 0.00012174236379537572, + "loss": 0.8725, + "step": 5905 + }, + { + "epoch": 7.309833024118738, + "grad_norm": 0.451171875, + "learning_rate": 0.00012160178219764837, + "loss": 0.8718, + "step": 5910 + }, + { + "epoch": 7.316017316017316, + "grad_norm": 0.44921875, + "learning_rate": 0.00012146115580394499, + "loss": 0.8763, + "step": 5915 + }, + { + "epoch": 7.322201607915893, + "grad_norm": 0.455078125, + "learning_rate": 0.00012132048490588492, + "loss": 0.8658, + "step": 5920 + }, + { + "epoch": 7.328385899814471, + "grad_norm": 0.466796875, + "learning_rate": 0.00012117976979517973, + "loss": 0.8633, + "step": 5925 + }, + { + "epoch": 7.334570191713049, + "grad_norm": 0.453125, + "learning_rate": 0.00012103901076363269, + "loss": 0.8827, + "step": 5930 + }, + { + "epoch": 7.340754483611627, + "grad_norm": 0.4765625, + "learning_rate": 0.00012089820810313812, + "loss": 0.8769, + "step": 5935 + }, + { + "epoch": 7.346938775510204, + "grad_norm": 0.52734375, + "learning_rate": 0.0001207573621056809, + "loss": 0.8767, + "step": 5940 + }, + { + "epoch": 7.353123067408782, + "grad_norm": 0.451171875, + "learning_rate": 0.00012061647306333568, + "loss": 0.8909, + "step": 5945 + }, + { + "epoch": 7.359307359307359, + "grad_norm": 0.4921875, + "learning_rate": 0.00012047554126826643, + "loss": 0.8697, + "step": 5950 + }, + { + "epoch": 7.365491651205937, + "grad_norm": 0.4296875, + "learning_rate": 0.00012033456701272576, + "loss": 0.8715, + "step": 5955 + }, + { + "epoch": 7.371675943104514, + "grad_norm": 0.4609375, + "learning_rate": 0.00012019355058905435, + "loss": 0.8731, + "step": 5960 + }, + { + "epoch": 7.377860235003092, + "grad_norm": 0.447265625, + "learning_rate": 0.00012005249228968032, + "loss": 0.8818, + "step": 5965 + }, + { + "epoch": 7.38404452690167, + "grad_norm": 0.466796875, + "learning_rate": 0.00011991139240711857, + "loss": 0.8656, + "step": 5970 + }, + { + "epoch": 7.390228818800248, + "grad_norm": 0.51171875, + "learning_rate": 0.00011977025123397033, + "loss": 0.8862, + "step": 5975 + }, + { + "epoch": 7.396413110698825, + "grad_norm": 0.49609375, + "learning_rate": 0.00011962906906292238, + "loss": 0.8726, + "step": 5980 + }, + { + "epoch": 7.402597402597403, + "grad_norm": 0.4765625, + "learning_rate": 0.00011948784618674653, + "loss": 0.8633, + "step": 5985 + }, + { + "epoch": 7.40878169449598, + "grad_norm": 0.45703125, + "learning_rate": 0.00011934658289829902, + "loss": 0.8705, + "step": 5990 + }, + { + "epoch": 7.414965986394558, + "grad_norm": 0.451171875, + "learning_rate": 0.00011920527949051991, + "loss": 0.8736, + "step": 5995 + }, + { + "epoch": 7.421150278293135, + "grad_norm": 0.462890625, + "learning_rate": 0.00011906393625643244, + "loss": 0.8721, + "step": 6000 + }, + { + "epoch": 7.427334570191713, + "grad_norm": 0.44921875, + "learning_rate": 0.00011892255348914239, + "loss": 0.8733, + "step": 6005 + }, + { + "epoch": 7.433518862090291, + "grad_norm": 0.435546875, + "learning_rate": 0.00011878113148183758, + "loss": 0.8753, + "step": 6010 + }, + { + "epoch": 7.4397031539888685, + "grad_norm": 0.4296875, + "learning_rate": 0.00011863967052778721, + "loss": 0.8805, + "step": 6015 + }, + { + "epoch": 7.445887445887446, + "grad_norm": 0.41796875, + "learning_rate": 0.00011849817092034118, + "loss": 0.881, + "step": 6020 + }, + { + "epoch": 7.4520717377860235, + "grad_norm": 0.44140625, + "learning_rate": 0.00011835663295292963, + "loss": 0.8815, + "step": 6025 + }, + { + "epoch": 7.458256029684601, + "grad_norm": 0.47265625, + "learning_rate": 0.00011821505691906216, + "loss": 0.8684, + "step": 6030 + }, + { + "epoch": 7.4644403215831785, + "grad_norm": 0.4453125, + "learning_rate": 0.00011807344311232738, + "loss": 0.88, + "step": 6035 + }, + { + "epoch": 7.470624613481756, + "grad_norm": 0.439453125, + "learning_rate": 0.00011793179182639218, + "loss": 0.8808, + "step": 6040 + }, + { + "epoch": 7.476808905380334, + "grad_norm": 0.484375, + "learning_rate": 0.0001177901033550012, + "loss": 0.8732, + "step": 6045 + }, + { + "epoch": 7.482993197278912, + "grad_norm": 0.408203125, + "learning_rate": 0.00011764837799197622, + "loss": 0.8767, + "step": 6050 + }, + { + "epoch": 7.489177489177489, + "grad_norm": 0.482421875, + "learning_rate": 0.00011750661603121544, + "loss": 0.8694, + "step": 6055 + }, + { + "epoch": 7.495361781076067, + "grad_norm": 0.4453125, + "learning_rate": 0.00011736481776669306, + "loss": 0.8702, + "step": 6060 + }, + { + "epoch": 7.501546072974644, + "grad_norm": 0.451171875, + "learning_rate": 0.00011722298349245844, + "loss": 0.8751, + "step": 6065 + }, + { + "epoch": 7.507730364873222, + "grad_norm": 0.43359375, + "learning_rate": 0.0001170811135026357, + "loss": 0.8758, + "step": 6070 + }, + { + "epoch": 7.513914656771799, + "grad_norm": 0.427734375, + "learning_rate": 0.00011693920809142305, + "loss": 0.8732, + "step": 6075 + }, + { + "epoch": 7.520098948670377, + "grad_norm": 0.4453125, + "learning_rate": 0.00011679726755309205, + "loss": 0.8649, + "step": 6080 + }, + { + "epoch": 7.526283240568954, + "grad_norm": 0.4453125, + "learning_rate": 0.00011665529218198721, + "loss": 0.8688, + "step": 6085 + }, + { + "epoch": 7.532467532467533, + "grad_norm": 0.455078125, + "learning_rate": 0.00011651328227252517, + "loss": 0.8727, + "step": 6090 + }, + { + "epoch": 7.53865182436611, + "grad_norm": 0.48828125, + "learning_rate": 0.0001163712381191943, + "loss": 0.8796, + "step": 6095 + }, + { + "epoch": 7.544836116264688, + "grad_norm": 0.44140625, + "learning_rate": 0.00011622916001655388, + "loss": 0.8723, + "step": 6100 + }, + { + "epoch": 7.551020408163265, + "grad_norm": 0.421875, + "learning_rate": 0.00011608704825923369, + "loss": 0.874, + "step": 6105 + }, + { + "epoch": 7.557204700061843, + "grad_norm": 0.44140625, + "learning_rate": 0.00011594490314193323, + "loss": 0.8792, + "step": 6110 + }, + { + "epoch": 7.56338899196042, + "grad_norm": 0.443359375, + "learning_rate": 0.00011580272495942119, + "loss": 0.8685, + "step": 6115 + }, + { + "epoch": 7.569573283858999, + "grad_norm": 0.43359375, + "learning_rate": 0.00011566051400653486, + "loss": 0.8656, + "step": 6120 + }, + { + "epoch": 7.575757575757576, + "grad_norm": 0.42578125, + "learning_rate": 0.00011551827057817945, + "loss": 0.8799, + "step": 6125 + }, + { + "epoch": 7.581941867656154, + "grad_norm": 0.439453125, + "learning_rate": 0.00011537599496932752, + "loss": 0.8796, + "step": 6130 + }, + { + "epoch": 7.588126159554731, + "grad_norm": 0.515625, + "learning_rate": 0.00011523368747501839, + "loss": 0.8738, + "step": 6135 + }, + { + "epoch": 7.594310451453309, + "grad_norm": 0.53125, + "learning_rate": 0.00011509134839035748, + "loss": 0.8787, + "step": 6140 + }, + { + "epoch": 7.600494743351886, + "grad_norm": 0.453125, + "learning_rate": 0.00011494897801051574, + "loss": 0.8755, + "step": 6145 + }, + { + "epoch": 7.606679035250464, + "grad_norm": 0.48828125, + "learning_rate": 0.00011480657663072896, + "loss": 0.878, + "step": 6150 + }, + { + "epoch": 7.612863327149041, + "grad_norm": 0.58984375, + "learning_rate": 0.00011466414454629731, + "loss": 0.8772, + "step": 6155 + }, + { + "epoch": 7.619047619047619, + "grad_norm": 0.45703125, + "learning_rate": 0.0001145216820525845, + "loss": 0.8697, + "step": 6160 + }, + { + "epoch": 7.625231910946197, + "grad_norm": 0.6171875, + "learning_rate": 0.00011437918944501749, + "loss": 0.8816, + "step": 6165 + }, + { + "epoch": 7.6314162028447745, + "grad_norm": 0.470703125, + "learning_rate": 0.00011423666701908547, + "loss": 0.8607, + "step": 6170 + }, + { + "epoch": 7.637600494743352, + "grad_norm": 0.4375, + "learning_rate": 0.00011409411507033962, + "loss": 0.8783, + "step": 6175 + }, + { + "epoch": 7.6437847866419295, + "grad_norm": 0.5234375, + "learning_rate": 0.00011395153389439233, + "loss": 0.8765, + "step": 6180 + }, + { + "epoch": 7.649969078540507, + "grad_norm": 0.41796875, + "learning_rate": 0.00011380892378691646, + "loss": 0.8723, + "step": 6185 + }, + { + "epoch": 7.6561533704390845, + "grad_norm": 0.4921875, + "learning_rate": 0.00011366628504364509, + "loss": 0.8783, + "step": 6190 + }, + { + "epoch": 7.662337662337662, + "grad_norm": 0.49609375, + "learning_rate": 0.00011352361796037047, + "loss": 0.8764, + "step": 6195 + }, + { + "epoch": 7.66852195423624, + "grad_norm": 0.4609375, + "learning_rate": 0.00011338092283294377, + "loss": 0.8831, + "step": 6200 + }, + { + "epoch": 7.674706246134818, + "grad_norm": 0.447265625, + "learning_rate": 0.00011323819995727421, + "loss": 0.8728, + "step": 6205 + }, + { + "epoch": 7.680890538033395, + "grad_norm": 0.4609375, + "learning_rate": 0.00011309544962932862, + "loss": 0.8765, + "step": 6210 + }, + { + "epoch": 7.687074829931973, + "grad_norm": 0.4609375, + "learning_rate": 0.0001129526721451307, + "loss": 0.8725, + "step": 6215 + }, + { + "epoch": 7.69325912183055, + "grad_norm": 0.48046875, + "learning_rate": 0.00011280986780076057, + "loss": 0.8677, + "step": 6220 + }, + { + "epoch": 7.699443413729128, + "grad_norm": 0.50390625, + "learning_rate": 0.00011266703689235394, + "loss": 0.8769, + "step": 6225 + }, + { + "epoch": 7.705627705627705, + "grad_norm": 0.625, + "learning_rate": 0.00011252417971610163, + "loss": 0.8719, + "step": 6230 + }, + { + "epoch": 7.711811997526283, + "grad_norm": 0.474609375, + "learning_rate": 0.00011238129656824898, + "loss": 0.8735, + "step": 6235 + }, + { + "epoch": 7.71799628942486, + "grad_norm": 0.490234375, + "learning_rate": 0.00011223838774509514, + "loss": 0.8828, + "step": 6240 + }, + { + "epoch": 7.724180581323439, + "grad_norm": 0.546875, + "learning_rate": 0.00011209545354299251, + "loss": 0.8807, + "step": 6245 + }, + { + "epoch": 7.730364873222016, + "grad_norm": 0.52734375, + "learning_rate": 0.00011195249425834615, + "loss": 0.8844, + "step": 6250 + }, + { + "epoch": 7.736549165120594, + "grad_norm": 0.47265625, + "learning_rate": 0.00011180951018761314, + "loss": 0.8783, + "step": 6255 + }, + { + "epoch": 7.742733457019171, + "grad_norm": 0.45703125, + "learning_rate": 0.00011166650162730188, + "loss": 0.8608, + "step": 6260 + }, + { + "epoch": 7.748917748917749, + "grad_norm": 0.451171875, + "learning_rate": 0.0001115234688739716, + "loss": 0.8747, + "step": 6265 + }, + { + "epoch": 7.755102040816326, + "grad_norm": 0.484375, + "learning_rate": 0.00011138041222423177, + "loss": 0.8773, + "step": 6270 + }, + { + "epoch": 7.761286332714904, + "grad_norm": 0.47265625, + "learning_rate": 0.00011123733197474128, + "loss": 0.8672, + "step": 6275 + }, + { + "epoch": 7.767470624613482, + "grad_norm": 0.474609375, + "learning_rate": 0.00011109422842220805, + "loss": 0.8731, + "step": 6280 + }, + { + "epoch": 7.77365491651206, + "grad_norm": 0.5078125, + "learning_rate": 0.00011095110186338835, + "loss": 0.8684, + "step": 6285 + }, + { + "epoch": 7.779839208410637, + "grad_norm": 0.427734375, + "learning_rate": 0.00011080795259508608, + "loss": 0.8699, + "step": 6290 + }, + { + "epoch": 7.786023500309215, + "grad_norm": 0.427734375, + "learning_rate": 0.00011066478091415223, + "loss": 0.8673, + "step": 6295 + }, + { + "epoch": 7.792207792207792, + "grad_norm": 0.462890625, + "learning_rate": 0.00011052158711748434, + "loss": 0.8707, + "step": 6300 + }, + { + "epoch": 7.79839208410637, + "grad_norm": 0.462890625, + "learning_rate": 0.00011037837150202576, + "loss": 0.8729, + "step": 6305 + }, + { + "epoch": 7.804576376004947, + "grad_norm": 0.4375, + "learning_rate": 0.00011023513436476511, + "loss": 0.8627, + "step": 6310 + }, + { + "epoch": 7.810760667903525, + "grad_norm": 0.4140625, + "learning_rate": 0.00011009187600273566, + "loss": 0.8787, + "step": 6315 + }, + { + "epoch": 7.816944959802103, + "grad_norm": 0.462890625, + "learning_rate": 0.00010994859671301462, + "loss": 0.8708, + "step": 6320 + }, + { + "epoch": 7.8231292517006805, + "grad_norm": 0.439453125, + "learning_rate": 0.00010980529679272266, + "loss": 0.8691, + "step": 6325 + }, + { + "epoch": 7.829313543599258, + "grad_norm": 0.515625, + "learning_rate": 0.0001096619765390232, + "loss": 0.8678, + "step": 6330 + }, + { + "epoch": 7.8354978354978355, + "grad_norm": 0.48046875, + "learning_rate": 0.00010951863624912185, + "loss": 0.8747, + "step": 6335 + }, + { + "epoch": 7.841682127396413, + "grad_norm": 0.455078125, + "learning_rate": 0.00010937527622026575, + "loss": 0.8727, + "step": 6340 + }, + { + "epoch": 7.8478664192949905, + "grad_norm": 0.470703125, + "learning_rate": 0.00010923189674974301, + "loss": 0.8742, + "step": 6345 + }, + { + "epoch": 7.854050711193568, + "grad_norm": 0.421875, + "learning_rate": 0.00010908849813488203, + "loss": 0.8675, + "step": 6350 + }, + { + "epoch": 7.860235003092146, + "grad_norm": 0.466796875, + "learning_rate": 0.00010894508067305088, + "loss": 0.8758, + "step": 6355 + }, + { + "epoch": 7.866419294990724, + "grad_norm": 0.474609375, + "learning_rate": 0.00010880164466165674, + "loss": 0.8716, + "step": 6360 + }, + { + "epoch": 7.872603586889301, + "grad_norm": 0.45703125, + "learning_rate": 0.00010865819039814526, + "loss": 0.8738, + "step": 6365 + }, + { + "epoch": 7.878787878787879, + "grad_norm": 0.47265625, + "learning_rate": 0.00010851471817999997, + "loss": 0.8819, + "step": 6370 + }, + { + "epoch": 7.884972170686456, + "grad_norm": 0.46484375, + "learning_rate": 0.00010837122830474158, + "loss": 0.8703, + "step": 6375 + }, + { + "epoch": 7.891156462585034, + "grad_norm": 0.439453125, + "learning_rate": 0.00010822772106992747, + "loss": 0.8778, + "step": 6380 + }, + { + "epoch": 7.897340754483611, + "grad_norm": 0.4609375, + "learning_rate": 0.00010808419677315093, + "loss": 0.8737, + "step": 6385 + }, + { + "epoch": 7.903525046382189, + "grad_norm": 0.4921875, + "learning_rate": 0.00010794065571204072, + "loss": 0.8775, + "step": 6390 + }, + { + "epoch": 7.909709338280766, + "grad_norm": 0.42578125, + "learning_rate": 0.00010779709818426033, + "loss": 0.8834, + "step": 6395 + }, + { + "epoch": 7.915893630179345, + "grad_norm": 0.41796875, + "learning_rate": 0.0001076535244875074, + "loss": 0.8678, + "step": 6400 + }, + { + "epoch": 7.922077922077922, + "grad_norm": 0.4375, + "learning_rate": 0.0001075099349195131, + "loss": 0.8648, + "step": 6405 + }, + { + "epoch": 7.9282622139765, + "grad_norm": 0.51953125, + "learning_rate": 0.00010736632977804149, + "loss": 0.8707, + "step": 6410 + }, + { + "epoch": 7.934446505875077, + "grad_norm": 0.41796875, + "learning_rate": 0.00010722270936088898, + "loss": 0.8705, + "step": 6415 + }, + { + "epoch": 7.940630797773655, + "grad_norm": 0.53515625, + "learning_rate": 0.00010707907396588361, + "loss": 0.8754, + "step": 6420 + }, + { + "epoch": 7.946815089672232, + "grad_norm": 0.51953125, + "learning_rate": 0.00010693542389088452, + "loss": 0.8723, + "step": 6425 + }, + { + "epoch": 7.95299938157081, + "grad_norm": 0.4453125, + "learning_rate": 0.00010679175943378119, + "loss": 0.8746, + "step": 6430 + }, + { + "epoch": 7.959183673469388, + "grad_norm": 0.515625, + "learning_rate": 0.00010664808089249305, + "loss": 0.8709, + "step": 6435 + }, + { + "epoch": 7.965367965367966, + "grad_norm": 0.66796875, + "learning_rate": 0.00010650438856496872, + "loss": 0.871, + "step": 6440 + }, + { + "epoch": 7.971552257266543, + "grad_norm": 0.455078125, + "learning_rate": 0.00010636068274918536, + "loss": 0.8661, + "step": 6445 + }, + { + "epoch": 7.977736549165121, + "grad_norm": 0.462890625, + "learning_rate": 0.00010621696374314807, + "loss": 0.8711, + "step": 6450 + }, + { + "epoch": 7.983920841063698, + "grad_norm": 0.51953125, + "learning_rate": 0.0001060732318448894, + "loss": 0.8759, + "step": 6455 + }, + { + "epoch": 7.990105132962276, + "grad_norm": 0.51171875, + "learning_rate": 0.00010592948735246854, + "loss": 0.8632, + "step": 6460 + }, + { + "epoch": 7.996289424860853, + "grad_norm": 0.45703125, + "learning_rate": 0.00010578573056397085, + "loss": 0.8694, + "step": 6465 + }, + { + "epoch": 8.0, + "eval_loss": 2.506842851638794, + "eval_runtime": 0.5393, + "eval_samples_per_second": 18.543, + "eval_steps_per_second": 1.854, + "step": 6468 + }, + { + "epoch": 8.00247371675943, + "grad_norm": 0.447265625, + "learning_rate": 0.00010564196177750725, + "loss": 0.8705, + "step": 6470 + }, + { + "epoch": 8.008658008658008, + "grad_norm": 0.478515625, + "learning_rate": 0.00010549818129121338, + "loss": 0.8568, + "step": 6475 + }, + { + "epoch": 8.014842300556586, + "grad_norm": 0.49609375, + "learning_rate": 0.0001053543894032493, + "loss": 0.8752, + "step": 6480 + }, + { + "epoch": 8.021026592455163, + "grad_norm": 0.474609375, + "learning_rate": 0.00010521058641179861, + "loss": 0.8689, + "step": 6485 + }, + { + "epoch": 8.02721088435374, + "grad_norm": 0.41015625, + "learning_rate": 0.00010506677261506797, + "loss": 0.8673, + "step": 6490 + }, + { + "epoch": 8.03339517625232, + "grad_norm": 0.490234375, + "learning_rate": 0.00010492294831128641, + "loss": 0.8757, + "step": 6495 + }, + { + "epoch": 8.039579468150897, + "grad_norm": 0.462890625, + "learning_rate": 0.00010477911379870488, + "loss": 0.8522, + "step": 6500 + }, + { + "epoch": 8.045763760049475, + "grad_norm": 0.47265625, + "learning_rate": 0.00010463526937559536, + "loss": 0.8703, + "step": 6505 + }, + { + "epoch": 8.051948051948052, + "grad_norm": 0.455078125, + "learning_rate": 0.00010449141534025045, + "loss": 0.8631, + "step": 6510 + }, + { + "epoch": 8.05813234384663, + "grad_norm": 0.46484375, + "learning_rate": 0.00010434755199098261, + "loss": 0.8672, + "step": 6515 + }, + { + "epoch": 8.064316635745207, + "grad_norm": 0.5078125, + "learning_rate": 0.00010420367962612372, + "loss": 0.8732, + "step": 6520 + }, + { + "epoch": 8.070500927643785, + "grad_norm": 0.4609375, + "learning_rate": 0.00010405979854402425, + "loss": 0.8687, + "step": 6525 + }, + { + "epoch": 8.076685219542362, + "grad_norm": 0.4453125, + "learning_rate": 0.00010391590904305284, + "loss": 0.8756, + "step": 6530 + }, + { + "epoch": 8.08286951144094, + "grad_norm": 0.451171875, + "learning_rate": 0.00010377201142159554, + "loss": 0.8664, + "step": 6535 + }, + { + "epoch": 8.089053803339517, + "grad_norm": 0.48046875, + "learning_rate": 0.00010362810597805526, + "loss": 0.867, + "step": 6540 + }, + { + "epoch": 8.095238095238095, + "grad_norm": 0.46484375, + "learning_rate": 0.00010348419301085113, + "loss": 0.8612, + "step": 6545 + }, + { + "epoch": 8.101422387136672, + "grad_norm": 0.5078125, + "learning_rate": 0.00010334027281841781, + "loss": 0.8646, + "step": 6550 + }, + { + "epoch": 8.10760667903525, + "grad_norm": 0.458984375, + "learning_rate": 0.00010319634569920504, + "loss": 0.8753, + "step": 6555 + }, + { + "epoch": 8.113790970933827, + "grad_norm": 0.4765625, + "learning_rate": 0.00010305241195167687, + "loss": 0.8595, + "step": 6560 + }, + { + "epoch": 8.119975262832405, + "grad_norm": 0.48046875, + "learning_rate": 0.00010290847187431113, + "loss": 0.864, + "step": 6565 + }, + { + "epoch": 8.126159554730984, + "grad_norm": 0.46875, + "learning_rate": 0.00010276452576559879, + "loss": 0.8822, + "step": 6570 + }, + { + "epoch": 8.132343846629562, + "grad_norm": 0.5, + "learning_rate": 0.00010262057392404328, + "loss": 0.8721, + "step": 6575 + }, + { + "epoch": 8.13852813852814, + "grad_norm": 0.43359375, + "learning_rate": 0.00010247661664815986, + "loss": 0.8611, + "step": 6580 + }, + { + "epoch": 8.144712430426717, + "grad_norm": 0.4765625, + "learning_rate": 0.00010233265423647523, + "loss": 0.8668, + "step": 6585 + }, + { + "epoch": 8.150896722325294, + "grad_norm": 0.455078125, + "learning_rate": 0.00010218868698752658, + "loss": 0.8696, + "step": 6590 + }, + { + "epoch": 8.157081014223872, + "grad_norm": 0.45703125, + "learning_rate": 0.00010204471519986124, + "loss": 0.8711, + "step": 6595 + }, + { + "epoch": 8.16326530612245, + "grad_norm": 0.416015625, + "learning_rate": 0.00010190073917203589, + "loss": 0.87, + "step": 6600 + }, + { + "epoch": 8.169449598021027, + "grad_norm": 0.52734375, + "learning_rate": 0.00010175675920261602, + "loss": 0.8702, + "step": 6605 + }, + { + "epoch": 8.175633889919604, + "grad_norm": 0.52734375, + "learning_rate": 0.00010161277559017528, + "loss": 0.864, + "step": 6610 + }, + { + "epoch": 8.181818181818182, + "grad_norm": 0.54296875, + "learning_rate": 0.00010146878863329492, + "loss": 0.8583, + "step": 6615 + }, + { + "epoch": 8.18800247371676, + "grad_norm": 0.474609375, + "learning_rate": 0.00010132479863056303, + "loss": 0.8689, + "step": 6620 + }, + { + "epoch": 8.194186765615337, + "grad_norm": 0.486328125, + "learning_rate": 0.00010118080588057416, + "loss": 0.8629, + "step": 6625 + }, + { + "epoch": 8.200371057513914, + "grad_norm": 0.466796875, + "learning_rate": 0.00010103681068192845, + "loss": 0.8656, + "step": 6630 + }, + { + "epoch": 8.206555349412492, + "grad_norm": 0.482421875, + "learning_rate": 0.00010089281333323112, + "loss": 0.8723, + "step": 6635 + }, + { + "epoch": 8.21273964131107, + "grad_norm": 0.443359375, + "learning_rate": 0.00010074881413309193, + "loss": 0.8689, + "step": 6640 + }, + { + "epoch": 8.218923933209647, + "grad_norm": 0.423828125, + "learning_rate": 0.00010060481338012435, + "loss": 0.8706, + "step": 6645 + }, + { + "epoch": 8.225108225108226, + "grad_norm": 0.44140625, + "learning_rate": 0.00010046081137294516, + "loss": 0.8672, + "step": 6650 + }, + { + "epoch": 8.231292517006803, + "grad_norm": 0.58203125, + "learning_rate": 0.00010031680841017377, + "loss": 0.8601, + "step": 6655 + }, + { + "epoch": 8.237476808905381, + "grad_norm": 0.490234375, + "learning_rate": 0.00010017280479043147, + "loss": 0.866, + "step": 6660 + }, + { + "epoch": 8.243661100803958, + "grad_norm": 0.5546875, + "learning_rate": 0.00010002880081234103, + "loss": 0.8567, + "step": 6665 + }, + { + "epoch": 8.249845392702536, + "grad_norm": 0.546875, + "learning_rate": 9.988479677452584e-05, + "loss": 0.8731, + "step": 6670 + }, + { + "epoch": 8.256029684601113, + "grad_norm": 0.451171875, + "learning_rate": 9.97407929756095e-05, + "loss": 0.8686, + "step": 6675 + }, + { + "epoch": 8.262213976499691, + "grad_norm": 0.4765625, + "learning_rate": 9.959678971421508e-05, + "loss": 0.8743, + "step": 6680 + }, + { + "epoch": 8.268398268398268, + "grad_norm": 0.45703125, + "learning_rate": 9.94527872889646e-05, + "loss": 0.8558, + "step": 6685 + }, + { + "epoch": 8.274582560296846, + "grad_norm": 0.466796875, + "learning_rate": 9.930878599847821e-05, + "loss": 0.8717, + "step": 6690 + }, + { + "epoch": 8.280766852195423, + "grad_norm": 0.46484375, + "learning_rate": 9.916478614137383e-05, + "loss": 0.8627, + "step": 6695 + }, + { + "epoch": 8.286951144094001, + "grad_norm": 0.470703125, + "learning_rate": 9.902078801626636e-05, + "loss": 0.8667, + "step": 6700 + }, + { + "epoch": 8.293135435992578, + "grad_norm": 0.50390625, + "learning_rate": 9.887679192176712e-05, + "loss": 0.8584, + "step": 6705 + }, + { + "epoch": 8.299319727891156, + "grad_norm": 0.455078125, + "learning_rate": 9.873279815648318e-05, + "loss": 0.8681, + "step": 6710 + }, + { + "epoch": 8.305504019789733, + "grad_norm": 0.44140625, + "learning_rate": 9.858880701901682e-05, + "loss": 0.8734, + "step": 6715 + }, + { + "epoch": 8.311688311688311, + "grad_norm": 0.44140625, + "learning_rate": 9.844481880796491e-05, + "loss": 0.8738, + "step": 6720 + }, + { + "epoch": 8.317872603586888, + "grad_norm": 0.45703125, + "learning_rate": 9.830083382191819e-05, + "loss": 0.8735, + "step": 6725 + }, + { + "epoch": 8.324056895485468, + "grad_norm": 0.44921875, + "learning_rate": 9.815685235946068e-05, + "loss": 0.8685, + "step": 6730 + }, + { + "epoch": 8.330241187384045, + "grad_norm": 0.447265625, + "learning_rate": 9.801287471916919e-05, + "loss": 0.8737, + "step": 6735 + }, + { + "epoch": 8.336425479282623, + "grad_norm": 0.44140625, + "learning_rate": 9.786890119961253e-05, + "loss": 0.8647, + "step": 6740 + }, + { + "epoch": 8.3426097711812, + "grad_norm": 0.486328125, + "learning_rate": 9.772493209935099e-05, + "loss": 0.8701, + "step": 6745 + }, + { + "epoch": 8.348794063079778, + "grad_norm": 0.455078125, + "learning_rate": 9.758096771693573e-05, + "loss": 0.8589, + "step": 6750 + }, + { + "epoch": 8.354978354978355, + "grad_norm": 0.4453125, + "learning_rate": 9.743700835090804e-05, + "loss": 0.8674, + "step": 6755 + }, + { + "epoch": 8.361162646876933, + "grad_norm": 0.474609375, + "learning_rate": 9.729305429979887e-05, + "loss": 0.8682, + "step": 6760 + }, + { + "epoch": 8.36734693877551, + "grad_norm": 0.439453125, + "learning_rate": 9.714910586212816e-05, + "loss": 0.8732, + "step": 6765 + }, + { + "epoch": 8.373531230674088, + "grad_norm": 0.462890625, + "learning_rate": 9.700516333640415e-05, + "loss": 0.8845, + "step": 6770 + }, + { + "epoch": 8.379715522572665, + "grad_norm": 0.439453125, + "learning_rate": 9.686122702112285e-05, + "loss": 0.8646, + "step": 6775 + }, + { + "epoch": 8.385899814471243, + "grad_norm": 0.451171875, + "learning_rate": 9.671729721476746e-05, + "loss": 0.8635, + "step": 6780 + }, + { + "epoch": 8.39208410636982, + "grad_norm": 0.470703125, + "learning_rate": 9.657337421580759e-05, + "loss": 0.8722, + "step": 6785 + }, + { + "epoch": 8.398268398268398, + "grad_norm": 0.423828125, + "learning_rate": 9.642945832269874e-05, + "loss": 0.8505, + "step": 6790 + }, + { + "epoch": 8.404452690166975, + "grad_norm": 0.439453125, + "learning_rate": 9.628554983388173e-05, + "loss": 0.865, + "step": 6795 + }, + { + "epoch": 8.410636982065553, + "grad_norm": 0.490234375, + "learning_rate": 9.614164904778196e-05, + "loss": 0.8605, + "step": 6800 + }, + { + "epoch": 8.416821273964132, + "grad_norm": 0.431640625, + "learning_rate": 9.599775626280892e-05, + "loss": 0.867, + "step": 6805 + }, + { + "epoch": 8.42300556586271, + "grad_norm": 0.443359375, + "learning_rate": 9.585387177735547e-05, + "loss": 0.8689, + "step": 6810 + }, + { + "epoch": 8.429189857761287, + "grad_norm": 0.4765625, + "learning_rate": 9.570999588979728e-05, + "loss": 0.8721, + "step": 6815 + }, + { + "epoch": 8.435374149659864, + "grad_norm": 0.47265625, + "learning_rate": 9.556612889849214e-05, + "loss": 0.8601, + "step": 6820 + }, + { + "epoch": 8.441558441558442, + "grad_norm": 0.466796875, + "learning_rate": 9.542227110177945e-05, + "loss": 0.8559, + "step": 6825 + }, + { + "epoch": 8.44774273345702, + "grad_norm": 0.45703125, + "learning_rate": 9.527842279797953e-05, + "loss": 0.866, + "step": 6830 + }, + { + "epoch": 8.453927025355597, + "grad_norm": 0.447265625, + "learning_rate": 9.513458428539298e-05, + "loss": 0.8635, + "step": 6835 + }, + { + "epoch": 8.460111317254174, + "grad_norm": 0.466796875, + "learning_rate": 9.499075586230013e-05, + "loss": 0.8694, + "step": 6840 + }, + { + "epoch": 8.466295609152752, + "grad_norm": 0.44140625, + "learning_rate": 9.484693782696041e-05, + "loss": 0.8716, + "step": 6845 + }, + { + "epoch": 8.47247990105133, + "grad_norm": 0.455078125, + "learning_rate": 9.470313047761167e-05, + "loss": 0.8623, + "step": 6850 + }, + { + "epoch": 8.478664192949907, + "grad_norm": 0.423828125, + "learning_rate": 9.455933411246958e-05, + "loss": 0.8603, + "step": 6855 + }, + { + "epoch": 8.484848484848484, + "grad_norm": 0.42578125, + "learning_rate": 9.44155490297271e-05, + "loss": 0.863, + "step": 6860 + }, + { + "epoch": 8.491032776747062, + "grad_norm": 0.447265625, + "learning_rate": 9.427177552755374e-05, + "loss": 0.8601, + "step": 6865 + }, + { + "epoch": 8.49721706864564, + "grad_norm": 0.443359375, + "learning_rate": 9.412801390409497e-05, + "loss": 0.8738, + "step": 6870 + }, + { + "epoch": 8.503401360544217, + "grad_norm": 0.50390625, + "learning_rate": 9.398426445747171e-05, + "loss": 0.8707, + "step": 6875 + }, + { + "epoch": 8.509585652442794, + "grad_norm": 0.60546875, + "learning_rate": 9.38405274857796e-05, + "loss": 0.8667, + "step": 6880 + }, + { + "epoch": 8.515769944341374, + "grad_norm": 0.48828125, + "learning_rate": 9.369680328708836e-05, + "loss": 0.876, + "step": 6885 + }, + { + "epoch": 8.521954236239951, + "grad_norm": 0.439453125, + "learning_rate": 9.355309215944124e-05, + "loss": 0.8676, + "step": 6890 + }, + { + "epoch": 8.528138528138529, + "grad_norm": 0.43359375, + "learning_rate": 9.340939440085445e-05, + "loss": 0.8488, + "step": 6895 + }, + { + "epoch": 8.534322820037106, + "grad_norm": 0.455078125, + "learning_rate": 9.326571030931637e-05, + "loss": 0.8724, + "step": 6900 + }, + { + "epoch": 8.540507111935684, + "grad_norm": 0.447265625, + "learning_rate": 9.312204018278716e-05, + "loss": 0.8657, + "step": 6905 + }, + { + "epoch": 8.546691403834261, + "grad_norm": 0.421875, + "learning_rate": 9.297838431919794e-05, + "loss": 0.8611, + "step": 6910 + }, + { + "epoch": 8.552875695732839, + "grad_norm": 0.443359375, + "learning_rate": 9.283474301645026e-05, + "loss": 0.8634, + "step": 6915 + }, + { + "epoch": 8.559059987631416, + "grad_norm": 0.474609375, + "learning_rate": 9.269111657241548e-05, + "loss": 0.8627, + "step": 6920 + }, + { + "epoch": 8.565244279529994, + "grad_norm": 0.462890625, + "learning_rate": 9.254750528493417e-05, + "loss": 0.8572, + "step": 6925 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 0.46875, + "learning_rate": 9.240390945181543e-05, + "loss": 0.8648, + "step": 6930 + }, + { + "epoch": 8.577612863327149, + "grad_norm": 0.490234375, + "learning_rate": 9.226032937083635e-05, + "loss": 0.8621, + "step": 6935 + }, + { + "epoch": 8.583797155225726, + "grad_norm": 0.427734375, + "learning_rate": 9.211676533974131e-05, + "loss": 0.8634, + "step": 6940 + }, + { + "epoch": 8.589981447124304, + "grad_norm": 0.4453125, + "learning_rate": 9.197321765624152e-05, + "loss": 0.8667, + "step": 6945 + }, + { + "epoch": 8.596165739022881, + "grad_norm": 0.45703125, + "learning_rate": 9.182968661801412e-05, + "loss": 0.8707, + "step": 6950 + }, + { + "epoch": 8.602350030921459, + "grad_norm": 0.46875, + "learning_rate": 9.168617252270183e-05, + "loss": 0.8772, + "step": 6955 + }, + { + "epoch": 8.608534322820038, + "grad_norm": 0.4453125, + "learning_rate": 9.154267566791223e-05, + "loss": 0.8584, + "step": 6960 + }, + { + "epoch": 8.614718614718615, + "grad_norm": 0.470703125, + "learning_rate": 9.139919635121714e-05, + "loss": 0.8657, + "step": 6965 + }, + { + "epoch": 8.620902906617193, + "grad_norm": 0.4375, + "learning_rate": 9.125573487015203e-05, + "loss": 0.8632, + "step": 6970 + }, + { + "epoch": 8.62708719851577, + "grad_norm": 0.45703125, + "learning_rate": 9.111229152221535e-05, + "loss": 0.8705, + "step": 6975 + }, + { + "epoch": 8.633271490414348, + "grad_norm": 0.458984375, + "learning_rate": 9.096886660486797e-05, + "loss": 0.869, + "step": 6980 + }, + { + "epoch": 8.639455782312925, + "grad_norm": 0.4375, + "learning_rate": 9.082546041553253e-05, + "loss": 0.875, + "step": 6985 + }, + { + "epoch": 8.645640074211503, + "grad_norm": 0.470703125, + "learning_rate": 9.068207325159284e-05, + "loss": 0.8667, + "step": 6990 + }, + { + "epoch": 8.65182436611008, + "grad_norm": 0.439453125, + "learning_rate": 9.053870541039327e-05, + "loss": 0.8749, + "step": 6995 + }, + { + "epoch": 8.658008658008658, + "grad_norm": 0.451171875, + "learning_rate": 9.039535718923804e-05, + "loss": 0.8608, + "step": 7000 + }, + { + "epoch": 8.664192949907235, + "grad_norm": 0.474609375, + "learning_rate": 9.02520288853908e-05, + "loss": 0.8621, + "step": 7005 + }, + { + "epoch": 8.670377241805813, + "grad_norm": 0.42578125, + "learning_rate": 9.01087207960739e-05, + "loss": 0.8716, + "step": 7010 + }, + { + "epoch": 8.67656153370439, + "grad_norm": 0.4609375, + "learning_rate": 8.996543321846759e-05, + "loss": 0.8588, + "step": 7015 + }, + { + "epoch": 8.682745825602968, + "grad_norm": 0.439453125, + "learning_rate": 8.982216644970979e-05, + "loss": 0.8595, + "step": 7020 + }, + { + "epoch": 8.688930117501545, + "grad_norm": 0.46484375, + "learning_rate": 8.967892078689513e-05, + "loss": 0.8813, + "step": 7025 + }, + { + "epoch": 8.695114409400123, + "grad_norm": 0.435546875, + "learning_rate": 8.953569652707459e-05, + "loss": 0.8618, + "step": 7030 + }, + { + "epoch": 8.7012987012987, + "grad_norm": 0.44921875, + "learning_rate": 8.939249396725467e-05, + "loss": 0.8558, + "step": 7035 + }, + { + "epoch": 8.70748299319728, + "grad_norm": 0.4765625, + "learning_rate": 8.924931340439694e-05, + "loss": 0.8712, + "step": 7040 + }, + { + "epoch": 8.713667285095857, + "grad_norm": 0.44921875, + "learning_rate": 8.910615513541729e-05, + "loss": 0.8496, + "step": 7045 + }, + { + "epoch": 8.719851576994435, + "grad_norm": 0.439453125, + "learning_rate": 8.896301945718541e-05, + "loss": 0.8614, + "step": 7050 + }, + { + "epoch": 8.726035868893012, + "grad_norm": 0.486328125, + "learning_rate": 8.881990666652417e-05, + "loss": 0.8733, + "step": 7055 + }, + { + "epoch": 8.73222016079159, + "grad_norm": 0.486328125, + "learning_rate": 8.867681706020894e-05, + "loss": 0.8747, + "step": 7060 + }, + { + "epoch": 8.738404452690167, + "grad_norm": 0.41015625, + "learning_rate": 8.853375093496699e-05, + "loss": 0.8717, + "step": 7065 + }, + { + "epoch": 8.744588744588745, + "grad_norm": 0.458984375, + "learning_rate": 8.839070858747697e-05, + "loss": 0.8632, + "step": 7070 + }, + { + "epoch": 8.750773036487322, + "grad_norm": 0.40625, + "learning_rate": 8.824769031436822e-05, + "loss": 0.867, + "step": 7075 + }, + { + "epoch": 8.7569573283859, + "grad_norm": 0.4921875, + "learning_rate": 8.810469641222001e-05, + "loss": 0.8627, + "step": 7080 + }, + { + "epoch": 8.763141620284477, + "grad_norm": 0.498046875, + "learning_rate": 8.796172717756124e-05, + "loss": 0.8723, + "step": 7085 + }, + { + "epoch": 8.769325912183055, + "grad_norm": 0.5078125, + "learning_rate": 8.781878290686959e-05, + "loss": 0.878, + "step": 7090 + }, + { + "epoch": 8.775510204081632, + "grad_norm": 0.5078125, + "learning_rate": 8.767586389657098e-05, + "loss": 0.8723, + "step": 7095 + }, + { + "epoch": 8.78169449598021, + "grad_norm": 0.49609375, + "learning_rate": 8.753297044303896e-05, + "loss": 0.8715, + "step": 7100 + }, + { + "epoch": 8.787878787878787, + "grad_norm": 0.439453125, + "learning_rate": 8.739010284259406e-05, + "loss": 0.8626, + "step": 7105 + }, + { + "epoch": 8.794063079777365, + "grad_norm": 0.427734375, + "learning_rate": 8.724726139150318e-05, + "loss": 0.8616, + "step": 7110 + }, + { + "epoch": 8.800247371675944, + "grad_norm": 0.4375, + "learning_rate": 8.710444638597905e-05, + "loss": 0.8726, + "step": 7115 + }, + { + "epoch": 8.806431663574521, + "grad_norm": 0.498046875, + "learning_rate": 8.696165812217953e-05, + "loss": 0.8644, + "step": 7120 + }, + { + "epoch": 8.812615955473099, + "grad_norm": 0.447265625, + "learning_rate": 8.6818896896207e-05, + "loss": 0.86, + "step": 7125 + }, + { + "epoch": 8.818800247371676, + "grad_norm": 0.47265625, + "learning_rate": 8.667616300410778e-05, + "loss": 0.871, + "step": 7130 + }, + { + "epoch": 8.824984539270254, + "grad_norm": 0.4609375, + "learning_rate": 8.653345674187157e-05, + "loss": 0.8669, + "step": 7135 + }, + { + "epoch": 8.831168831168831, + "grad_norm": 0.458984375, + "learning_rate": 8.639077840543077e-05, + "loss": 0.8533, + "step": 7140 + }, + { + "epoch": 8.837353123067409, + "grad_norm": 0.44921875, + "learning_rate": 8.62481282906597e-05, + "loss": 0.8853, + "step": 7145 + }, + { + "epoch": 8.843537414965986, + "grad_norm": 0.4296875, + "learning_rate": 8.610550669337433e-05, + "loss": 0.864, + "step": 7150 + }, + { + "epoch": 8.849721706864564, + "grad_norm": 0.4375, + "learning_rate": 8.596291390933147e-05, + "loss": 0.8616, + "step": 7155 + }, + { + "epoch": 8.855905998763141, + "grad_norm": 0.4375, + "learning_rate": 8.582035023422815e-05, + "loss": 0.8634, + "step": 7160 + }, + { + "epoch": 8.862090290661719, + "grad_norm": 0.416015625, + "learning_rate": 8.567781596370104e-05, + "loss": 0.872, + "step": 7165 + }, + { + "epoch": 8.868274582560296, + "grad_norm": 0.42578125, + "learning_rate": 8.553531139332582e-05, + "loss": 0.8525, + "step": 7170 + }, + { + "epoch": 8.874458874458874, + "grad_norm": 0.43359375, + "learning_rate": 8.539283681861661e-05, + "loss": 0.8559, + "step": 7175 + }, + { + "epoch": 8.880643166357451, + "grad_norm": 0.40625, + "learning_rate": 8.525039253502529e-05, + "loss": 0.8702, + "step": 7180 + }, + { + "epoch": 8.886827458256029, + "grad_norm": 0.45703125, + "learning_rate": 8.510797883794097e-05, + "loss": 0.867, + "step": 7185 + }, + { + "epoch": 8.893011750154606, + "grad_norm": 0.44140625, + "learning_rate": 8.496559602268928e-05, + "loss": 0.8573, + "step": 7190 + }, + { + "epoch": 8.899196042053186, + "grad_norm": 0.45703125, + "learning_rate": 8.482324438453187e-05, + "loss": 0.8687, + "step": 7195 + }, + { + "epoch": 8.905380333951763, + "grad_norm": 0.4765625, + "learning_rate": 8.468092421866573e-05, + "loss": 0.8658, + "step": 7200 + }, + { + "epoch": 8.91156462585034, + "grad_norm": 0.44921875, + "learning_rate": 8.45386358202225e-05, + "loss": 0.8594, + "step": 7205 + }, + { + "epoch": 8.917748917748918, + "grad_norm": 0.427734375, + "learning_rate": 8.439637948426801e-05, + "loss": 0.8699, + "step": 7210 + }, + { + "epoch": 8.923933209647496, + "grad_norm": 0.447265625, + "learning_rate": 8.425415550580162e-05, + "loss": 0.865, + "step": 7215 + }, + { + "epoch": 8.930117501546073, + "grad_norm": 0.4453125, + "learning_rate": 8.411196417975558e-05, + "loss": 0.8646, + "step": 7220 + }, + { + "epoch": 8.93630179344465, + "grad_norm": 0.43359375, + "learning_rate": 8.396980580099438e-05, + "loss": 0.8553, + "step": 7225 + }, + { + "epoch": 8.942486085343228, + "grad_norm": 0.478515625, + "learning_rate": 8.382768066431425e-05, + "loss": 0.8629, + "step": 7230 + }, + { + "epoch": 8.948670377241806, + "grad_norm": 0.427734375, + "learning_rate": 8.368558906444244e-05, + "loss": 0.8691, + "step": 7235 + }, + { + "epoch": 8.954854669140383, + "grad_norm": 0.4296875, + "learning_rate": 8.354353129603668e-05, + "loss": 0.8654, + "step": 7240 + }, + { + "epoch": 8.96103896103896, + "grad_norm": 0.421875, + "learning_rate": 8.340150765368452e-05, + "loss": 0.8607, + "step": 7245 + }, + { + "epoch": 8.967223252937538, + "grad_norm": 0.421875, + "learning_rate": 8.325951843190274e-05, + "loss": 0.8741, + "step": 7250 + }, + { + "epoch": 8.973407544836116, + "grad_norm": 0.447265625, + "learning_rate": 8.311756392513681e-05, + "loss": 0.8696, + "step": 7255 + }, + { + "epoch": 8.979591836734693, + "grad_norm": 0.59375, + "learning_rate": 8.297564442776014e-05, + "loss": 0.876, + "step": 7260 + }, + { + "epoch": 8.98577612863327, + "grad_norm": 0.4921875, + "learning_rate": 8.283376023407357e-05, + "loss": 0.8813, + "step": 7265 + }, + { + "epoch": 8.99196042053185, + "grad_norm": 0.447265625, + "learning_rate": 8.269191163830467e-05, + "loss": 0.8682, + "step": 7270 + }, + { + "epoch": 8.998144712430427, + "grad_norm": 0.439453125, + "learning_rate": 8.255009893460724e-05, + "loss": 0.8608, + "step": 7275 + }, + { + "epoch": 8.999381570810142, + "eval_loss": 2.514324903488159, + "eval_runtime": 0.6521, + "eval_samples_per_second": 15.335, + "eval_steps_per_second": 1.534, + "step": 7276 + }, + { + "epoch": 9.004329004329005, + "grad_norm": 0.443359375, + "learning_rate": 8.240832241706068e-05, + "loss": 0.8585, + "step": 7280 + }, + { + "epoch": 9.010513296227582, + "grad_norm": 0.423828125, + "learning_rate": 8.226658237966933e-05, + "loss": 0.8596, + "step": 7285 + }, + { + "epoch": 9.01669758812616, + "grad_norm": 0.466796875, + "learning_rate": 8.212487911636184e-05, + "loss": 0.8587, + "step": 7290 + }, + { + "epoch": 9.022881880024737, + "grad_norm": 0.48828125, + "learning_rate": 8.198321292099064e-05, + "loss": 0.8649, + "step": 7295 + }, + { + "epoch": 9.029066171923315, + "grad_norm": 0.431640625, + "learning_rate": 8.184158408733131e-05, + "loss": 0.8721, + "step": 7300 + }, + { + "epoch": 9.035250463821892, + "grad_norm": 0.44140625, + "learning_rate": 8.169999290908188e-05, + "loss": 0.861, + "step": 7305 + }, + { + "epoch": 9.04143475572047, + "grad_norm": 0.439453125, + "learning_rate": 8.155843967986236e-05, + "loss": 0.8489, + "step": 7310 + }, + { + "epoch": 9.047619047619047, + "grad_norm": 0.455078125, + "learning_rate": 8.141692469321403e-05, + "loss": 0.8582, + "step": 7315 + }, + { + "epoch": 9.053803339517625, + "grad_norm": 0.4296875, + "learning_rate": 8.127544824259889e-05, + "loss": 0.8654, + "step": 7320 + }, + { + "epoch": 9.059987631416202, + "grad_norm": 0.421875, + "learning_rate": 8.113401062139901e-05, + "loss": 0.8545, + "step": 7325 + }, + { + "epoch": 9.06617192331478, + "grad_norm": 0.439453125, + "learning_rate": 8.099261212291601e-05, + "loss": 0.8645, + "step": 7330 + }, + { + "epoch": 9.072356215213357, + "grad_norm": 0.498046875, + "learning_rate": 8.085125304037018e-05, + "loss": 0.869, + "step": 7335 + }, + { + "epoch": 9.078540507111935, + "grad_norm": 0.45703125, + "learning_rate": 8.070993366690029e-05, + "loss": 0.8699, + "step": 7340 + }, + { + "epoch": 9.084724799010512, + "grad_norm": 0.4609375, + "learning_rate": 8.056865429556267e-05, + "loss": 0.8558, + "step": 7345 + }, + { + "epoch": 9.090909090909092, + "grad_norm": 0.453125, + "learning_rate": 8.042741521933071e-05, + "loss": 0.8524, + "step": 7350 + }, + { + "epoch": 9.09709338280767, + "grad_norm": 0.474609375, + "learning_rate": 8.028621673109425e-05, + "loss": 0.8664, + "step": 7355 + }, + { + "epoch": 9.103277674706247, + "grad_norm": 0.486328125, + "learning_rate": 8.014505912365893e-05, + "loss": 0.8638, + "step": 7360 + }, + { + "epoch": 9.109461966604824, + "grad_norm": 0.435546875, + "learning_rate": 8.000394268974563e-05, + "loss": 0.8599, + "step": 7365 + }, + { + "epoch": 9.115646258503402, + "grad_norm": 0.451171875, + "learning_rate": 7.986286772198986e-05, + "loss": 0.8649, + "step": 7370 + }, + { + "epoch": 9.12183055040198, + "grad_norm": 0.42578125, + "learning_rate": 7.972183451294112e-05, + "loss": 0.8691, + "step": 7375 + }, + { + "epoch": 9.128014842300557, + "grad_norm": 0.458984375, + "learning_rate": 7.958084335506239e-05, + "loss": 0.8663, + "step": 7380 + }, + { + "epoch": 9.134199134199134, + "grad_norm": 0.453125, + "learning_rate": 7.943989454072931e-05, + "loss": 0.8588, + "step": 7385 + }, + { + "epoch": 9.140383426097712, + "grad_norm": 0.462890625, + "learning_rate": 7.929898836222983e-05, + "loss": 0.8668, + "step": 7390 + }, + { + "epoch": 9.14656771799629, + "grad_norm": 0.4609375, + "learning_rate": 7.915812511176347e-05, + "loss": 0.8612, + "step": 7395 + }, + { + "epoch": 9.152752009894867, + "grad_norm": 0.462890625, + "learning_rate": 7.90173050814406e-05, + "loss": 0.8627, + "step": 7400 + }, + { + "epoch": 9.158936301793444, + "grad_norm": 0.421875, + "learning_rate": 7.887652856328214e-05, + "loss": 0.8531, + "step": 7405 + }, + { + "epoch": 9.165120593692022, + "grad_norm": 0.4609375, + "learning_rate": 7.873579584921869e-05, + "loss": 0.8705, + "step": 7410 + }, + { + "epoch": 9.1713048855906, + "grad_norm": 0.45703125, + "learning_rate": 7.859510723109003e-05, + "loss": 0.8737, + "step": 7415 + }, + { + "epoch": 9.177489177489177, + "grad_norm": 0.421875, + "learning_rate": 7.84544630006445e-05, + "loss": 0.8651, + "step": 7420 + }, + { + "epoch": 9.183673469387756, + "grad_norm": 0.455078125, + "learning_rate": 7.831386344953836e-05, + "loss": 0.8513, + "step": 7425 + }, + { + "epoch": 9.189857761286333, + "grad_norm": 0.447265625, + "learning_rate": 7.817330886933527e-05, + "loss": 0.8665, + "step": 7430 + }, + { + "epoch": 9.196042053184911, + "grad_norm": 0.447265625, + "learning_rate": 7.803279955150558e-05, + "loss": 0.8645, + "step": 7435 + }, + { + "epoch": 9.202226345083488, + "grad_norm": 0.4375, + "learning_rate": 7.789233578742582e-05, + "loss": 0.8693, + "step": 7440 + }, + { + "epoch": 9.208410636982066, + "grad_norm": 0.4375, + "learning_rate": 7.775191786837807e-05, + "loss": 0.8573, + "step": 7445 + }, + { + "epoch": 9.214594928880643, + "grad_norm": 0.427734375, + "learning_rate": 7.761154608554927e-05, + "loss": 0.8614, + "step": 7450 + }, + { + "epoch": 9.220779220779221, + "grad_norm": 0.5078125, + "learning_rate": 7.747122073003075e-05, + "loss": 0.8628, + "step": 7455 + }, + { + "epoch": 9.226963512677798, + "grad_norm": 0.439453125, + "learning_rate": 7.733094209281756e-05, + "loss": 0.8563, + "step": 7460 + }, + { + "epoch": 9.233147804576376, + "grad_norm": 0.44921875, + "learning_rate": 7.719071046480776e-05, + "loss": 0.8668, + "step": 7465 + }, + { + "epoch": 9.239332096474953, + "grad_norm": 0.43359375, + "learning_rate": 7.705052613680211e-05, + "loss": 0.8607, + "step": 7470 + }, + { + "epoch": 9.245516388373531, + "grad_norm": 0.439453125, + "learning_rate": 7.691038939950316e-05, + "loss": 0.8543, + "step": 7475 + }, + { + "epoch": 9.251700680272108, + "grad_norm": 0.44921875, + "learning_rate": 7.677030054351477e-05, + "loss": 0.858, + "step": 7480 + }, + { + "epoch": 9.257884972170686, + "grad_norm": 0.419921875, + "learning_rate": 7.663025985934158e-05, + "loss": 0.8605, + "step": 7485 + }, + { + "epoch": 9.264069264069263, + "grad_norm": 0.4609375, + "learning_rate": 7.649026763738827e-05, + "loss": 0.8628, + "step": 7490 + }, + { + "epoch": 9.270253555967841, + "grad_norm": 0.48046875, + "learning_rate": 7.635032416795905e-05, + "loss": 0.8618, + "step": 7495 + }, + { + "epoch": 9.276437847866418, + "grad_norm": 0.455078125, + "learning_rate": 7.6210429741257e-05, + "loss": 0.8675, + "step": 7500 + }, + { + "epoch": 9.282622139764998, + "grad_norm": 0.423828125, + "learning_rate": 7.607058464738357e-05, + "loss": 0.8614, + "step": 7505 + }, + { + "epoch": 9.288806431663575, + "grad_norm": 0.453125, + "learning_rate": 7.593078917633787e-05, + "loss": 0.8658, + "step": 7510 + }, + { + "epoch": 9.294990723562153, + "grad_norm": 0.443359375, + "learning_rate": 7.579104361801605e-05, + "loss": 0.8552, + "step": 7515 + }, + { + "epoch": 9.30117501546073, + "grad_norm": 0.4765625, + "learning_rate": 7.565134826221083e-05, + "loss": 0.8667, + "step": 7520 + }, + { + "epoch": 9.307359307359308, + "grad_norm": 0.4453125, + "learning_rate": 7.551170339861083e-05, + "loss": 0.8498, + "step": 7525 + }, + { + "epoch": 9.313543599257885, + "grad_norm": 0.48828125, + "learning_rate": 7.537210931679987e-05, + "loss": 0.8498, + "step": 7530 + }, + { + "epoch": 9.319727891156463, + "grad_norm": 0.466796875, + "learning_rate": 7.523256630625657e-05, + "loss": 0.8632, + "step": 7535 + }, + { + "epoch": 9.32591218305504, + "grad_norm": 0.412109375, + "learning_rate": 7.509307465635358e-05, + "loss": 0.8497, + "step": 7540 + }, + { + "epoch": 9.332096474953618, + "grad_norm": 0.466796875, + "learning_rate": 7.495363465635708e-05, + "loss": 0.8685, + "step": 7545 + }, + { + "epoch": 9.338280766852195, + "grad_norm": 0.423828125, + "learning_rate": 7.481424659542609e-05, + "loss": 0.8521, + "step": 7550 + }, + { + "epoch": 9.344465058750773, + "grad_norm": 0.41796875, + "learning_rate": 7.467491076261197e-05, + "loss": 0.8644, + "step": 7555 + }, + { + "epoch": 9.35064935064935, + "grad_norm": 0.431640625, + "learning_rate": 7.453562744685778e-05, + "loss": 0.8676, + "step": 7560 + }, + { + "epoch": 9.356833642547928, + "grad_norm": 0.44140625, + "learning_rate": 7.439639693699763e-05, + "loss": 0.8689, + "step": 7565 + }, + { + "epoch": 9.363017934446505, + "grad_norm": 0.451171875, + "learning_rate": 7.425721952175618e-05, + "loss": 0.8636, + "step": 7570 + }, + { + "epoch": 9.369202226345083, + "grad_norm": 0.427734375, + "learning_rate": 7.411809548974792e-05, + "loss": 0.8494, + "step": 7575 + }, + { + "epoch": 9.375386518243662, + "grad_norm": 0.431640625, + "learning_rate": 7.39790251294767e-05, + "loss": 0.8693, + "step": 7580 + }, + { + "epoch": 9.38157081014224, + "grad_norm": 0.45703125, + "learning_rate": 7.384000872933506e-05, + "loss": 0.8667, + "step": 7585 + }, + { + "epoch": 9.387755102040817, + "grad_norm": 0.4375, + "learning_rate": 7.370104657760361e-05, + "loss": 0.8572, + "step": 7590 + }, + { + "epoch": 9.393939393939394, + "grad_norm": 0.47265625, + "learning_rate": 7.356213896245046e-05, + "loss": 0.8636, + "step": 7595 + }, + { + "epoch": 9.400123685837972, + "grad_norm": 0.43359375, + "learning_rate": 7.342328617193067e-05, + "loss": 0.8729, + "step": 7600 + }, + { + "epoch": 9.40630797773655, + "grad_norm": 0.435546875, + "learning_rate": 7.328448849398558e-05, + "loss": 0.8612, + "step": 7605 + }, + { + "epoch": 9.412492269635127, + "grad_norm": 0.439453125, + "learning_rate": 7.314574621644225e-05, + "loss": 0.8672, + "step": 7610 + }, + { + "epoch": 9.418676561533704, + "grad_norm": 0.431640625, + "learning_rate": 7.300705962701287e-05, + "loss": 0.8604, + "step": 7615 + }, + { + "epoch": 9.424860853432282, + "grad_norm": 0.4921875, + "learning_rate": 7.286842901329412e-05, + "loss": 0.854, + "step": 7620 + }, + { + "epoch": 9.43104514533086, + "grad_norm": 0.458984375, + "learning_rate": 7.272985466276661e-05, + "loss": 0.8513, + "step": 7625 + }, + { + "epoch": 9.437229437229437, + "grad_norm": 0.4375, + "learning_rate": 7.259133686279429e-05, + "loss": 0.8654, + "step": 7630 + }, + { + "epoch": 9.443413729128014, + "grad_norm": 0.443359375, + "learning_rate": 7.245287590062384e-05, + "loss": 0.8533, + "step": 7635 + }, + { + "epoch": 9.449598021026592, + "grad_norm": 0.50390625, + "learning_rate": 7.231447206338407e-05, + "loss": 0.8777, + "step": 7640 + }, + { + "epoch": 9.45578231292517, + "grad_norm": 0.4453125, + "learning_rate": 7.217612563808528e-05, + "loss": 0.8644, + "step": 7645 + }, + { + "epoch": 9.461966604823747, + "grad_norm": 0.4453125, + "learning_rate": 7.203783691161883e-05, + "loss": 0.8646, + "step": 7650 + }, + { + "epoch": 9.468150896722324, + "grad_norm": 0.4453125, + "learning_rate": 7.189960617075633e-05, + "loss": 0.8791, + "step": 7655 + }, + { + "epoch": 9.474335188620904, + "grad_norm": 0.412109375, + "learning_rate": 7.176143370214914e-05, + "loss": 0.8591, + "step": 7660 + }, + { + "epoch": 9.480519480519481, + "grad_norm": 0.45703125, + "learning_rate": 7.162331979232783e-05, + "loss": 0.8641, + "step": 7665 + }, + { + "epoch": 9.486703772418059, + "grad_norm": 0.43359375, + "learning_rate": 7.148526472770154e-05, + "loss": 0.8763, + "step": 7670 + }, + { + "epoch": 9.492888064316636, + "grad_norm": 0.50390625, + "learning_rate": 7.134726879455734e-05, + "loss": 0.8668, + "step": 7675 + }, + { + "epoch": 9.499072356215214, + "grad_norm": 0.43359375, + "learning_rate": 7.12093322790597e-05, + "loss": 0.8619, + "step": 7680 + }, + { + "epoch": 9.505256648113791, + "grad_norm": 0.427734375, + "learning_rate": 7.107145546724989e-05, + "loss": 0.8695, + "step": 7685 + }, + { + "epoch": 9.511440940012369, + "grad_norm": 0.4921875, + "learning_rate": 7.09336386450453e-05, + "loss": 0.8523, + "step": 7690 + }, + { + "epoch": 9.517625231910946, + "grad_norm": 0.455078125, + "learning_rate": 7.079588209823906e-05, + "loss": 0.8676, + "step": 7695 + }, + { + "epoch": 9.523809523809524, + "grad_norm": 0.44140625, + "learning_rate": 7.065818611249915e-05, + "loss": 0.8555, + "step": 7700 + }, + { + "epoch": 9.529993815708101, + "grad_norm": 0.4296875, + "learning_rate": 7.052055097336805e-05, + "loss": 0.8612, + "step": 7705 + }, + { + "epoch": 9.536178107606679, + "grad_norm": 0.447265625, + "learning_rate": 7.038297696626206e-05, + "loss": 0.8602, + "step": 7710 + }, + { + "epoch": 9.542362399505256, + "grad_norm": 0.431640625, + "learning_rate": 7.02454643764707e-05, + "loss": 0.8636, + "step": 7715 + }, + { + "epoch": 9.548546691403834, + "grad_norm": 0.423828125, + "learning_rate": 7.010801348915608e-05, + "loss": 0.8566, + "step": 7720 + }, + { + "epoch": 9.554730983302411, + "grad_norm": 0.45703125, + "learning_rate": 6.99706245893524e-05, + "loss": 0.8603, + "step": 7725 + }, + { + "epoch": 9.560915275200989, + "grad_norm": 0.515625, + "learning_rate": 6.983329796196534e-05, + "loss": 0.858, + "step": 7730 + }, + { + "epoch": 9.567099567099568, + "grad_norm": 0.455078125, + "learning_rate": 6.969603389177142e-05, + "loss": 0.858, + "step": 7735 + }, + { + "epoch": 9.573283858998145, + "grad_norm": 0.421875, + "learning_rate": 6.955883266341741e-05, + "loss": 0.8738, + "step": 7740 + }, + { + "epoch": 9.579468150896723, + "grad_norm": 0.421875, + "learning_rate": 6.94216945614198e-05, + "loss": 0.8643, + "step": 7745 + }, + { + "epoch": 9.5856524427953, + "grad_norm": 0.494140625, + "learning_rate": 6.928461987016413e-05, + "loss": 0.8708, + "step": 7750 + }, + { + "epoch": 9.591836734693878, + "grad_norm": 0.44921875, + "learning_rate": 6.914760887390452e-05, + "loss": 0.8582, + "step": 7755 + }, + { + "epoch": 9.598021026592455, + "grad_norm": 0.42578125, + "learning_rate": 6.901066185676295e-05, + "loss": 0.8593, + "step": 7760 + }, + { + "epoch": 9.604205318491033, + "grad_norm": 0.44140625, + "learning_rate": 6.887377910272869e-05, + "loss": 0.8639, + "step": 7765 + }, + { + "epoch": 9.61038961038961, + "grad_norm": 0.451171875, + "learning_rate": 6.873696089565786e-05, + "loss": 0.8507, + "step": 7770 + }, + { + "epoch": 9.616573902288188, + "grad_norm": 0.45703125, + "learning_rate": 6.860020751927259e-05, + "loss": 0.8571, + "step": 7775 + }, + { + "epoch": 9.622758194186765, + "grad_norm": 0.474609375, + "learning_rate": 6.846351925716068e-05, + "loss": 0.8637, + "step": 7780 + }, + { + "epoch": 9.628942486085343, + "grad_norm": 0.443359375, + "learning_rate": 6.832689639277484e-05, + "loss": 0.8621, + "step": 7785 + }, + { + "epoch": 9.63512677798392, + "grad_norm": 0.419921875, + "learning_rate": 6.819033920943219e-05, + "loss": 0.8631, + "step": 7790 + }, + { + "epoch": 9.641311069882498, + "grad_norm": 0.4375, + "learning_rate": 6.805384799031361e-05, + "loss": 0.8645, + "step": 7795 + }, + { + "epoch": 9.647495361781075, + "grad_norm": 0.439453125, + "learning_rate": 6.791742301846326e-05, + "loss": 0.8617, + "step": 7800 + }, + { + "epoch": 9.653679653679653, + "grad_norm": 0.44140625, + "learning_rate": 6.778106457678785e-05, + "loss": 0.8613, + "step": 7805 + }, + { + "epoch": 9.65986394557823, + "grad_norm": 0.447265625, + "learning_rate": 6.764477294805615e-05, + "loss": 0.8543, + "step": 7810 + }, + { + "epoch": 9.666048237476808, + "grad_norm": 0.42578125, + "learning_rate": 6.750854841489842e-05, + "loss": 0.8599, + "step": 7815 + }, + { + "epoch": 9.672232529375387, + "grad_norm": 0.427734375, + "learning_rate": 6.737239125980573e-05, + "loss": 0.8622, + "step": 7820 + }, + { + "epoch": 9.678416821273965, + "grad_norm": 0.4296875, + "learning_rate": 6.723630176512944e-05, + "loss": 0.8587, + "step": 7825 + }, + { + "epoch": 9.684601113172542, + "grad_norm": 0.451171875, + "learning_rate": 6.710028021308061e-05, + "loss": 0.8655, + "step": 7830 + }, + { + "epoch": 9.69078540507112, + "grad_norm": 0.4375, + "learning_rate": 6.696432688572937e-05, + "loss": 0.8626, + "step": 7835 + }, + { + "epoch": 9.696969696969697, + "grad_norm": 0.44921875, + "learning_rate": 6.682844206500445e-05, + "loss": 0.8685, + "step": 7840 + }, + { + "epoch": 9.703153988868275, + "grad_norm": 0.435546875, + "learning_rate": 6.669262603269246e-05, + "loss": 0.8599, + "step": 7845 + }, + { + "epoch": 9.709338280766852, + "grad_norm": 0.439453125, + "learning_rate": 6.655687907043734e-05, + "loss": 0.8549, + "step": 7850 + }, + { + "epoch": 9.71552257266543, + "grad_norm": 0.447265625, + "learning_rate": 6.642120145973985e-05, + "loss": 0.8692, + "step": 7855 + }, + { + "epoch": 9.721706864564007, + "grad_norm": 0.447265625, + "learning_rate": 6.62855934819569e-05, + "loss": 0.8663, + "step": 7860 + }, + { + "epoch": 9.727891156462585, + "grad_norm": 0.486328125, + "learning_rate": 6.615005541830103e-05, + "loss": 0.8571, + "step": 7865 + }, + { + "epoch": 9.734075448361162, + "grad_norm": 0.4453125, + "learning_rate": 6.601458754983978e-05, + "loss": 0.862, + "step": 7870 + }, + { + "epoch": 9.74025974025974, + "grad_norm": 0.419921875, + "learning_rate": 6.587919015749511e-05, + "loss": 0.8637, + "step": 7875 + }, + { + "epoch": 9.746444032158317, + "grad_norm": 0.4609375, + "learning_rate": 6.574386352204289e-05, + "loss": 0.8586, + "step": 7880 + }, + { + "epoch": 9.752628324056895, + "grad_norm": 0.44140625, + "learning_rate": 6.560860792411219e-05, + "loss": 0.8576, + "step": 7885 + }, + { + "epoch": 9.758812615955474, + "grad_norm": 0.435546875, + "learning_rate": 6.547342364418481e-05, + "loss": 0.8503, + "step": 7890 + }, + { + "epoch": 9.764996907854051, + "grad_norm": 0.419921875, + "learning_rate": 6.533831096259467e-05, + "loss": 0.8615, + "step": 7895 + }, + { + "epoch": 9.771181199752629, + "grad_norm": 0.44140625, + "learning_rate": 6.520327015952713e-05, + "loss": 0.8564, + "step": 7900 + }, + { + "epoch": 9.777365491651206, + "grad_norm": 0.447265625, + "learning_rate": 6.506830151501861e-05, + "loss": 0.8571, + "step": 7905 + }, + { + "epoch": 9.783549783549784, + "grad_norm": 0.439453125, + "learning_rate": 6.493340530895583e-05, + "loss": 0.8671, + "step": 7910 + }, + { + "epoch": 9.789734075448361, + "grad_norm": 0.451171875, + "learning_rate": 6.479858182107527e-05, + "loss": 0.8657, + "step": 7915 + }, + { + "epoch": 9.795918367346939, + "grad_norm": 0.47265625, + "learning_rate": 6.466383133096267e-05, + "loss": 0.8556, + "step": 7920 + }, + { + "epoch": 9.802102659245516, + "grad_norm": 0.4296875, + "learning_rate": 6.452915411805238e-05, + "loss": 0.8502, + "step": 7925 + }, + { + "epoch": 9.808286951144094, + "grad_norm": 0.42578125, + "learning_rate": 6.439455046162677e-05, + "loss": 0.8599, + "step": 7930 + }, + { + "epoch": 9.814471243042671, + "grad_norm": 0.44921875, + "learning_rate": 6.426002064081565e-05, + "loss": 0.8641, + "step": 7935 + }, + { + "epoch": 9.820655534941249, + "grad_norm": 0.5078125, + "learning_rate": 6.412556493459581e-05, + "loss": 0.8644, + "step": 7940 + }, + { + "epoch": 9.826839826839826, + "grad_norm": 0.43359375, + "learning_rate": 6.399118362179028e-05, + "loss": 0.8574, + "step": 7945 + }, + { + "epoch": 9.833024118738404, + "grad_norm": 0.451171875, + "learning_rate": 6.385687698106781e-05, + "loss": 0.8651, + "step": 7950 + }, + { + "epoch": 9.839208410636981, + "grad_norm": 0.494140625, + "learning_rate": 6.372264529094233e-05, + "loss": 0.8756, + "step": 7955 + }, + { + "epoch": 9.845392702535559, + "grad_norm": 0.46484375, + "learning_rate": 6.358848882977233e-05, + "loss": 0.8606, + "step": 7960 + }, + { + "epoch": 9.851576994434136, + "grad_norm": 0.470703125, + "learning_rate": 6.345440787576031e-05, + "loss": 0.8615, + "step": 7965 + }, + { + "epoch": 9.857761286332714, + "grad_norm": 0.4453125, + "learning_rate": 6.332040270695219e-05, + "loss": 0.8724, + "step": 7970 + }, + { + "epoch": 9.863945578231293, + "grad_norm": 0.44921875, + "learning_rate": 6.31864736012367e-05, + "loss": 0.8641, + "step": 7975 + }, + { + "epoch": 9.87012987012987, + "grad_norm": 0.451171875, + "learning_rate": 6.305262083634488e-05, + "loss": 0.8693, + "step": 7980 + }, + { + "epoch": 9.876314162028448, + "grad_norm": 0.43359375, + "learning_rate": 6.291884468984941e-05, + "loss": 0.8602, + "step": 7985 + }, + { + "epoch": 9.882498453927026, + "grad_norm": 0.443359375, + "learning_rate": 6.278514543916415e-05, + "loss": 0.8574, + "step": 7990 + }, + { + "epoch": 9.888682745825603, + "grad_norm": 0.431640625, + "learning_rate": 6.265152336154345e-05, + "loss": 0.8529, + "step": 7995 + }, + { + "epoch": 9.89486703772418, + "grad_norm": 0.482421875, + "learning_rate": 6.251797873408161e-05, + "loss": 0.8701, + "step": 8000 + }, + { + "epoch": 9.901051329622758, + "grad_norm": 0.45703125, + "learning_rate": 6.238451183371241e-05, + "loss": 0.8587, + "step": 8005 + }, + { + "epoch": 9.907235621521336, + "grad_norm": 0.44140625, + "learning_rate": 6.225112293720836e-05, + "loss": 0.858, + "step": 8010 + }, + { + "epoch": 9.913419913419913, + "grad_norm": 0.466796875, + "learning_rate": 6.211781232118025e-05, + "loss": 0.8736, + "step": 8015 + }, + { + "epoch": 9.91960420531849, + "grad_norm": 0.4453125, + "learning_rate": 6.198458026207652e-05, + "loss": 0.8559, + "step": 8020 + }, + { + "epoch": 9.925788497217068, + "grad_norm": 0.458984375, + "learning_rate": 6.18514270361827e-05, + "loss": 0.8657, + "step": 8025 + }, + { + "epoch": 9.931972789115646, + "grad_norm": 0.431640625, + "learning_rate": 6.171835291962088e-05, + "loss": 0.8569, + "step": 8030 + }, + { + "epoch": 9.938157081014223, + "grad_norm": 0.431640625, + "learning_rate": 6.158535818834906e-05, + "loss": 0.8585, + "step": 8035 + }, + { + "epoch": 9.9443413729128, + "grad_norm": 0.451171875, + "learning_rate": 6.145244311816063e-05, + "loss": 0.863, + "step": 8040 + }, + { + "epoch": 9.95052566481138, + "grad_norm": 0.46484375, + "learning_rate": 6.13196079846838e-05, + "loss": 0.8619, + "step": 8045 + }, + { + "epoch": 9.956709956709958, + "grad_norm": 0.4375, + "learning_rate": 6.1186853063381e-05, + "loss": 0.8664, + "step": 8050 + }, + { + "epoch": 9.962894248608535, + "grad_norm": 0.43359375, + "learning_rate": 6.105417862954828e-05, + "loss": 0.8636, + "step": 8055 + }, + { + "epoch": 9.969078540507113, + "grad_norm": 0.431640625, + "learning_rate": 6.092158495831486e-05, + "loss": 0.8536, + "step": 8060 + }, + { + "epoch": 9.97526283240569, + "grad_norm": 0.44921875, + "learning_rate": 6.078907232464248e-05, + "loss": 0.8701, + "step": 8065 + }, + { + "epoch": 9.981447124304268, + "grad_norm": 0.421875, + "learning_rate": 6.065664100332478e-05, + "loss": 0.8686, + "step": 8070 + }, + { + "epoch": 9.987631416202845, + "grad_norm": 0.421875, + "learning_rate": 6.0524291268986766e-05, + "loss": 0.8626, + "step": 8075 + }, + { + "epoch": 9.993815708101423, + "grad_norm": 0.419921875, + "learning_rate": 6.039202339608432e-05, + "loss": 0.8695, + "step": 8080 + }, + { + "epoch": 10.0, + "grad_norm": 0.462890625, + "learning_rate": 6.025983765890353e-05, + "loss": 0.8421, + "step": 8085 + }, + { + "epoch": 10.0, + "eval_loss": 2.518721103668213, + "eval_runtime": 0.5383, + "eval_samples_per_second": 18.576, + "eval_steps_per_second": 1.858, + "step": 8085 + }, + { + "epoch": 10.006184291898577, + "grad_norm": 0.443359375, + "learning_rate": 6.012773433156017e-05, + "loss": 0.8602, + "step": 8090 + }, + { + "epoch": 10.012368583797155, + "grad_norm": 0.46484375, + "learning_rate": 5.99957136879991e-05, + "loss": 0.8548, + "step": 8095 + }, + { + "epoch": 10.018552875695732, + "grad_norm": 0.451171875, + "learning_rate": 5.986377600199371e-05, + "loss": 0.8597, + "step": 8100 + }, + { + "epoch": 10.02473716759431, + "grad_norm": 0.423828125, + "learning_rate": 5.973192154714547e-05, + "loss": 0.8643, + "step": 8105 + }, + { + "epoch": 10.030921459492887, + "grad_norm": 0.4453125, + "learning_rate": 5.9600150596883066e-05, + "loss": 0.8591, + "step": 8110 + }, + { + "epoch": 10.037105751391465, + "grad_norm": 0.462890625, + "learning_rate": 5.946846342446214e-05, + "loss": 0.8699, + "step": 8115 + }, + { + "epoch": 10.043290043290042, + "grad_norm": 0.427734375, + "learning_rate": 5.933686030296459e-05, + "loss": 0.8558, + "step": 8120 + }, + { + "epoch": 10.049474335188622, + "grad_norm": 0.439453125, + "learning_rate": 5.920534150529797e-05, + "loss": 0.8567, + "step": 8125 + }, + { + "epoch": 10.0556586270872, + "grad_norm": 0.46875, + "learning_rate": 5.907390730419507e-05, + "loss": 0.8566, + "step": 8130 + }, + { + "epoch": 10.061842918985777, + "grad_norm": 0.427734375, + "learning_rate": 5.894255797221313e-05, + "loss": 0.8462, + "step": 8135 + }, + { + "epoch": 10.068027210884354, + "grad_norm": 0.4375, + "learning_rate": 5.881129378173347e-05, + "loss": 0.8521, + "step": 8140 + }, + { + "epoch": 10.074211502782932, + "grad_norm": 0.490234375, + "learning_rate": 5.868011500496084e-05, + "loss": 0.858, + "step": 8145 + }, + { + "epoch": 10.08039579468151, + "grad_norm": 0.453125, + "learning_rate": 5.854902191392284e-05, + "loss": 0.8568, + "step": 8150 + }, + { + "epoch": 10.086580086580087, + "grad_norm": 0.474609375, + "learning_rate": 5.84180147804694e-05, + "loss": 0.864, + "step": 8155 + }, + { + "epoch": 10.092764378478664, + "grad_norm": 0.474609375, + "learning_rate": 5.828709387627218e-05, + "loss": 0.8633, + "step": 8160 + }, + { + "epoch": 10.098948670377242, + "grad_norm": 0.44921875, + "learning_rate": 5.8156259472824124e-05, + "loss": 0.8608, + "step": 8165 + }, + { + "epoch": 10.10513296227582, + "grad_norm": 0.447265625, + "learning_rate": 5.802551184143865e-05, + "loss": 0.8512, + "step": 8170 + }, + { + "epoch": 10.111317254174397, + "grad_norm": 0.44921875, + "learning_rate": 5.789485125324926e-05, + "loss": 0.8572, + "step": 8175 + }, + { + "epoch": 10.117501546072974, + "grad_norm": 0.4453125, + "learning_rate": 5.7764277979209094e-05, + "loss": 0.8498, + "step": 8180 + }, + { + "epoch": 10.123685837971552, + "grad_norm": 0.4453125, + "learning_rate": 5.763379229009003e-05, + "loss": 0.8596, + "step": 8185 + }, + { + "epoch": 10.12987012987013, + "grad_norm": 0.427734375, + "learning_rate": 5.750339445648252e-05, + "loss": 0.8667, + "step": 8190 + }, + { + "epoch": 10.136054421768707, + "grad_norm": 0.453125, + "learning_rate": 5.7373084748794626e-05, + "loss": 0.8661, + "step": 8195 + }, + { + "epoch": 10.142238713667284, + "grad_norm": 0.451171875, + "learning_rate": 5.724286343725185e-05, + "loss": 0.8522, + "step": 8200 + }, + { + "epoch": 10.148423005565864, + "grad_norm": 0.4375, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.863, + "step": 8205 + }, + { + "epoch": 10.154607297464441, + "grad_norm": 0.43359375, + "learning_rate": 5.6982687082585994e-05, + "loss": 0.8652, + "step": 8210 + }, + { + "epoch": 10.160791589363019, + "grad_norm": 0.43359375, + "learning_rate": 5.685273257899505e-05, + "loss": 0.8557, + "step": 8215 + }, + { + "epoch": 10.166975881261596, + "grad_norm": 0.431640625, + "learning_rate": 5.6722867550612116e-05, + "loss": 0.8567, + "step": 8220 + }, + { + "epoch": 10.173160173160174, + "grad_norm": 0.45703125, + "learning_rate": 5.6593092266740545e-05, + "loss": 0.8569, + "step": 8225 + }, + { + "epoch": 10.179344465058751, + "grad_norm": 0.44921875, + "learning_rate": 5.6463406996497456e-05, + "loss": 0.8591, + "step": 8230 + }, + { + "epoch": 10.185528756957329, + "grad_norm": 0.4453125, + "learning_rate": 5.633381200881335e-05, + "loss": 0.8581, + "step": 8235 + }, + { + "epoch": 10.191713048855906, + "grad_norm": 0.427734375, + "learning_rate": 5.620430757243156e-05, + "loss": 0.8579, + "step": 8240 + }, + { + "epoch": 10.197897340754484, + "grad_norm": 0.458984375, + "learning_rate": 5.6074893955907535e-05, + "loss": 0.8621, + "step": 8245 + }, + { + "epoch": 10.204081632653061, + "grad_norm": 0.4140625, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.8627, + "step": 8250 + }, + { + "epoch": 10.210265924551639, + "grad_norm": 0.421875, + "learning_rate": 5.581634025571274e-05, + "loss": 0.8577, + "step": 8255 + }, + { + "epoch": 10.216450216450216, + "grad_norm": 0.45703125, + "learning_rate": 5.5687200708209076e-05, + "loss": 0.8604, + "step": 8260 + }, + { + "epoch": 10.222634508348794, + "grad_norm": 0.451171875, + "learning_rate": 5.555815305289631e-05, + "loss": 0.8684, + "step": 8265 + }, + { + "epoch": 10.228818800247371, + "grad_norm": 0.44140625, + "learning_rate": 5.542919755738275e-05, + "loss": 0.8599, + "step": 8270 + }, + { + "epoch": 10.235003092145949, + "grad_norm": 0.515625, + "learning_rate": 5.5300334489085595e-05, + "loss": 0.8535, + "step": 8275 + }, + { + "epoch": 10.241187384044528, + "grad_norm": 0.421875, + "learning_rate": 5.5171564115230254e-05, + "loss": 0.8628, + "step": 8280 + }, + { + "epoch": 10.247371675943105, + "grad_norm": 0.42578125, + "learning_rate": 5.504288670285008e-05, + "loss": 0.8571, + "step": 8285 + }, + { + "epoch": 10.253555967841683, + "grad_norm": 0.43359375, + "learning_rate": 5.491430251878551e-05, + "loss": 0.8539, + "step": 8290 + }, + { + "epoch": 10.25974025974026, + "grad_norm": 0.443359375, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.8602, + "step": 8295 + }, + { + "epoch": 10.265924551638838, + "grad_norm": 0.44921875, + "learning_rate": 5.4657414901998095e-05, + "loss": 0.8619, + "step": 8300 + }, + { + "epoch": 10.272108843537415, + "grad_norm": 0.427734375, + "learning_rate": 5.4529112001987314e-05, + "loss": 0.8605, + "step": 8305 + }, + { + "epoch": 10.278293135435993, + "grad_norm": 0.416015625, + "learning_rate": 5.4400903395715366e-05, + "loss": 0.8549, + "step": 8310 + }, + { + "epoch": 10.28447742733457, + "grad_norm": 0.486328125, + "learning_rate": 5.427278934905049e-05, + "loss": 0.8649, + "step": 8315 + }, + { + "epoch": 10.290661719233148, + "grad_norm": 0.451171875, + "learning_rate": 5.4144770127665024e-05, + "loss": 0.8582, + "step": 8320 + }, + { + "epoch": 10.296846011131725, + "grad_norm": 0.4140625, + "learning_rate": 5.401684599703445e-05, + "loss": 0.8613, + "step": 8325 + }, + { + "epoch": 10.303030303030303, + "grad_norm": 0.44140625, + "learning_rate": 5.388901722243724e-05, + "loss": 0.8498, + "step": 8330 + }, + { + "epoch": 10.30921459492888, + "grad_norm": 0.43359375, + "learning_rate": 5.376128406895408e-05, + "loss": 0.8547, + "step": 8335 + }, + { + "epoch": 10.315398886827458, + "grad_norm": 0.43359375, + "learning_rate": 5.363364680146725e-05, + "loss": 0.8501, + "step": 8340 + }, + { + "epoch": 10.321583178726035, + "grad_norm": 0.4453125, + "learning_rate": 5.350610568466039e-05, + "loss": 0.8616, + "step": 8345 + }, + { + "epoch": 10.327767470624613, + "grad_norm": 0.44140625, + "learning_rate": 5.3378660983017536e-05, + "loss": 0.8573, + "step": 8350 + }, + { + "epoch": 10.33395176252319, + "grad_norm": 0.4375, + "learning_rate": 5.325131296082298e-05, + "loss": 0.8596, + "step": 8355 + }, + { + "epoch": 10.34013605442177, + "grad_norm": 0.423828125, + "learning_rate": 5.31240618821604e-05, + "loss": 0.8661, + "step": 8360 + }, + { + "epoch": 10.346320346320347, + "grad_norm": 0.431640625, + "learning_rate": 5.2996908010912437e-05, + "loss": 0.8436, + "step": 8365 + }, + { + "epoch": 10.352504638218925, + "grad_norm": 0.4296875, + "learning_rate": 5.286985161076029e-05, + "loss": 0.8591, + "step": 8370 + }, + { + "epoch": 10.358688930117502, + "grad_norm": 0.4375, + "learning_rate": 5.274289294518283e-05, + "loss": 0.8619, + "step": 8375 + }, + { + "epoch": 10.36487322201608, + "grad_norm": 0.45703125, + "learning_rate": 5.2616032277456463e-05, + "loss": 0.8588, + "step": 8380 + }, + { + "epoch": 10.371057513914657, + "grad_norm": 0.4375, + "learning_rate": 5.248926987065417e-05, + "loss": 0.8616, + "step": 8385 + }, + { + "epoch": 10.377241805813235, + "grad_norm": 0.435546875, + "learning_rate": 5.236260598764535e-05, + "loss": 0.8631, + "step": 8390 + }, + { + "epoch": 10.383426097711812, + "grad_norm": 0.44140625, + "learning_rate": 5.223604089109495e-05, + "loss": 0.8628, + "step": 8395 + }, + { + "epoch": 10.38961038961039, + "grad_norm": 0.453125, + "learning_rate": 5.210957484346314e-05, + "loss": 0.8553, + "step": 8400 + }, + { + "epoch": 10.395794681508967, + "grad_norm": 0.41796875, + "learning_rate": 5.198320810700472e-05, + "loss": 0.8551, + "step": 8405 + }, + { + "epoch": 10.401978973407545, + "grad_norm": 0.42578125, + "learning_rate": 5.185694094376843e-05, + "loss": 0.8565, + "step": 8410 + }, + { + "epoch": 10.408163265306122, + "grad_norm": 0.439453125, + "learning_rate": 5.173077361559665e-05, + "loss": 0.8714, + "step": 8415 + }, + { + "epoch": 10.4143475572047, + "grad_norm": 0.4609375, + "learning_rate": 5.160470638412461e-05, + "loss": 0.8585, + "step": 8420 + }, + { + "epoch": 10.420531849103277, + "grad_norm": 0.431640625, + "learning_rate": 5.1478739510780104e-05, + "loss": 0.8488, + "step": 8425 + }, + { + "epoch": 10.426716141001855, + "grad_norm": 0.462890625, + "learning_rate": 5.135287325678271e-05, + "loss": 0.8691, + "step": 8430 + }, + { + "epoch": 10.432900432900432, + "grad_norm": 0.4453125, + "learning_rate": 5.122710788314331e-05, + "loss": 0.8578, + "step": 8435 + }, + { + "epoch": 10.439084724799011, + "grad_norm": 0.44921875, + "learning_rate": 5.1101443650663764e-05, + "loss": 0.8591, + "step": 8440 + }, + { + "epoch": 10.445269016697589, + "grad_norm": 0.4296875, + "learning_rate": 5.0975880819936004e-05, + "loss": 0.8588, + "step": 8445 + }, + { + "epoch": 10.451453308596166, + "grad_norm": 0.4453125, + "learning_rate": 5.085041965134183e-05, + "loss": 0.8542, + "step": 8450 + }, + { + "epoch": 10.457637600494744, + "grad_norm": 0.41796875, + "learning_rate": 5.072506040505208e-05, + "loss": 0.8595, + "step": 8455 + }, + { + "epoch": 10.463821892393321, + "grad_norm": 0.44921875, + "learning_rate": 5.059980334102637e-05, + "loss": 0.8456, + "step": 8460 + }, + { + "epoch": 10.470006184291899, + "grad_norm": 0.439453125, + "learning_rate": 5.04746487190124e-05, + "loss": 0.855, + "step": 8465 + }, + { + "epoch": 10.476190476190476, + "grad_norm": 0.435546875, + "learning_rate": 5.034959679854532e-05, + "loss": 0.8573, + "step": 8470 + }, + { + "epoch": 10.482374768089054, + "grad_norm": 0.4375, + "learning_rate": 5.022464783894744e-05, + "loss": 0.8549, + "step": 8475 + }, + { + "epoch": 10.488559059987631, + "grad_norm": 0.46484375, + "learning_rate": 5.009980209932743e-05, + "loss": 0.8708, + "step": 8480 + }, + { + "epoch": 10.494743351886209, + "grad_norm": 0.453125, + "learning_rate": 4.9975059838580083e-05, + "loss": 0.8615, + "step": 8485 + }, + { + "epoch": 10.500927643784786, + "grad_norm": 0.421875, + "learning_rate": 4.985042131538545e-05, + "loss": 0.8629, + "step": 8490 + }, + { + "epoch": 10.507111935683364, + "grad_norm": 0.455078125, + "learning_rate": 4.9725886788208474e-05, + "loss": 0.8657, + "step": 8495 + }, + { + "epoch": 10.513296227581941, + "grad_norm": 0.439453125, + "learning_rate": 4.960145651529856e-05, + "loss": 0.8473, + "step": 8500 + }, + { + "epoch": 10.519480519480519, + "grad_norm": 0.419921875, + "learning_rate": 4.9477130754688775e-05, + "loss": 0.8669, + "step": 8505 + }, + { + "epoch": 10.525664811379096, + "grad_norm": 0.435546875, + "learning_rate": 4.9352909764195576e-05, + "loss": 0.8594, + "step": 8510 + }, + { + "epoch": 10.531849103277676, + "grad_norm": 0.46484375, + "learning_rate": 4.922879380141805e-05, + "loss": 0.8501, + "step": 8515 + }, + { + "epoch": 10.538033395176253, + "grad_norm": 0.466796875, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.8597, + "step": 8520 + }, + { + "epoch": 10.54421768707483, + "grad_norm": 0.431640625, + "learning_rate": 4.898087798831716e-05, + "loss": 0.8558, + "step": 8525 + }, + { + "epoch": 10.550401978973408, + "grad_norm": 0.4375, + "learning_rate": 4.885707865210093e-05, + "loss": 0.8599, + "step": 8530 + }, + { + "epoch": 10.556586270871986, + "grad_norm": 0.427734375, + "learning_rate": 4.873338537181368e-05, + "loss": 0.8601, + "step": 8535 + }, + { + "epoch": 10.562770562770563, + "grad_norm": 0.44921875, + "learning_rate": 4.860979840396016e-05, + "loss": 0.8651, + "step": 8540 + }, + { + "epoch": 10.56895485466914, + "grad_norm": 0.44140625, + "learning_rate": 4.8486318004824794e-05, + "loss": 0.8599, + "step": 8545 + }, + { + "epoch": 10.575139146567718, + "grad_norm": 0.453125, + "learning_rate": 4.836294443047088e-05, + "loss": 0.8638, + "step": 8550 + }, + { + "epoch": 10.581323438466296, + "grad_norm": 0.435546875, + "learning_rate": 4.823967793674033e-05, + "loss": 0.8507, + "step": 8555 + }, + { + "epoch": 10.587507730364873, + "grad_norm": 0.4453125, + "learning_rate": 4.8116518779252885e-05, + "loss": 0.8627, + "step": 8560 + }, + { + "epoch": 10.59369202226345, + "grad_norm": 0.427734375, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.8496, + "step": 8565 + }, + { + "epoch": 10.599876314162028, + "grad_norm": 0.421875, + "learning_rate": 4.787052349437295e-05, + "loss": 0.8644, + "step": 8570 + }, + { + "epoch": 10.606060606060606, + "grad_norm": 0.45703125, + "learning_rate": 4.774768787710501e-05, + "loss": 0.8613, + "step": 8575 + }, + { + "epoch": 10.612244897959183, + "grad_norm": 0.42578125, + "learning_rate": 4.762496061632814e-05, + "loss": 0.8637, + "step": 8580 + }, + { + "epoch": 10.61842918985776, + "grad_norm": 0.44140625, + "learning_rate": 4.7502341966544e-05, + "loss": 0.8614, + "step": 8585 + }, + { + "epoch": 10.624613481756338, + "grad_norm": 0.44140625, + "learning_rate": 4.7379832182028814e-05, + "loss": 0.8562, + "step": 8590 + }, + { + "epoch": 10.630797773654917, + "grad_norm": 0.4375, + "learning_rate": 4.725743151683325e-05, + "loss": 0.8573, + "step": 8595 + }, + { + "epoch": 10.636982065553495, + "grad_norm": 0.43359375, + "learning_rate": 4.713514022478155e-05, + "loss": 0.8611, + "step": 8600 + }, + { + "epoch": 10.643166357452072, + "grad_norm": 0.42578125, + "learning_rate": 4.701295855947126e-05, + "loss": 0.8604, + "step": 8605 + }, + { + "epoch": 10.64935064935065, + "grad_norm": 0.423828125, + "learning_rate": 4.689088677427249e-05, + "loss": 0.8561, + "step": 8610 + }, + { + "epoch": 10.655534941249227, + "grad_norm": 0.46484375, + "learning_rate": 4.676892512232758e-05, + "loss": 0.8619, + "step": 8615 + }, + { + "epoch": 10.661719233147805, + "grad_norm": 0.42578125, + "learning_rate": 4.6647073856550415e-05, + "loss": 0.8636, + "step": 8620 + }, + { + "epoch": 10.667903525046382, + "grad_norm": 0.48046875, + "learning_rate": 4.652533322962597e-05, + "loss": 0.8545, + "step": 8625 + }, + { + "epoch": 10.67408781694496, + "grad_norm": 0.484375, + "learning_rate": 4.6403703494009875e-05, + "loss": 0.8494, + "step": 8630 + }, + { + "epoch": 10.680272108843537, + "grad_norm": 0.44921875, + "learning_rate": 4.6282184901927674e-05, + "loss": 0.8562, + "step": 8635 + }, + { + "epoch": 10.686456400742115, + "grad_norm": 0.427734375, + "learning_rate": 4.6160777705374524e-05, + "loss": 0.8659, + "step": 8640 + }, + { + "epoch": 10.692640692640692, + "grad_norm": 0.46484375, + "learning_rate": 4.603948215611461e-05, + "loss": 0.8543, + "step": 8645 + }, + { + "epoch": 10.69882498453927, + "grad_norm": 0.44140625, + "learning_rate": 4.591829850568046e-05, + "loss": 0.8608, + "step": 8650 + }, + { + "epoch": 10.705009276437847, + "grad_norm": 0.427734375, + "learning_rate": 4.579722700537268e-05, + "loss": 0.8566, + "step": 8655 + }, + { + "epoch": 10.711193568336425, + "grad_norm": 0.4296875, + "learning_rate": 4.567626790625921e-05, + "loss": 0.8532, + "step": 8660 + }, + { + "epoch": 10.717377860235002, + "grad_norm": 0.427734375, + "learning_rate": 4.555542145917501e-05, + "loss": 0.8549, + "step": 8665 + }, + { + "epoch": 10.723562152133582, + "grad_norm": 0.453125, + "learning_rate": 4.543468791472131e-05, + "loss": 0.8586, + "step": 8670 + }, + { + "epoch": 10.729746444032159, + "grad_norm": 0.443359375, + "learning_rate": 4.5314067523265333e-05, + "loss": 0.8684, + "step": 8675 + }, + { + "epoch": 10.735930735930737, + "grad_norm": 0.42578125, + "learning_rate": 4.519356053493958e-05, + "loss": 0.8558, + "step": 8680 + }, + { + "epoch": 10.742115027829314, + "grad_norm": 0.412109375, + "learning_rate": 4.5073167199641367e-05, + "loss": 0.8604, + "step": 8685 + }, + { + "epoch": 10.748299319727892, + "grad_norm": 0.41796875, + "learning_rate": 4.495288776703241e-05, + "loss": 0.855, + "step": 8690 + }, + { + "epoch": 10.754483611626469, + "grad_norm": 0.439453125, + "learning_rate": 4.483272248653811e-05, + "loss": 0.8472, + "step": 8695 + }, + { + "epoch": 10.760667903525047, + "grad_norm": 0.44140625, + "learning_rate": 4.471267160734731e-05, + "loss": 0.8542, + "step": 8700 + }, + { + "epoch": 10.766852195423624, + "grad_norm": 0.4453125, + "learning_rate": 4.459273537841141e-05, + "loss": 0.8558, + "step": 8705 + }, + { + "epoch": 10.773036487322202, + "grad_norm": 0.44921875, + "learning_rate": 4.447291404844424e-05, + "loss": 0.849, + "step": 8710 + }, + { + "epoch": 10.779220779220779, + "grad_norm": 0.4296875, + "learning_rate": 4.43532078659213e-05, + "loss": 0.8582, + "step": 8715 + }, + { + "epoch": 10.785405071119357, + "grad_norm": 0.451171875, + "learning_rate": 4.4233617079079236e-05, + "loss": 0.8583, + "step": 8720 + }, + { + "epoch": 10.791589363017934, + "grad_norm": 0.466796875, + "learning_rate": 4.4114141935915534e-05, + "loss": 0.8683, + "step": 8725 + }, + { + "epoch": 10.797773654916512, + "grad_norm": 0.43359375, + "learning_rate": 4.399478268418771e-05, + "loss": 0.87, + "step": 8730 + }, + { + "epoch": 10.803957946815089, + "grad_norm": 0.4375, + "learning_rate": 4.3875539571413106e-05, + "loss": 0.8518, + "step": 8735 + }, + { + "epoch": 10.810142238713667, + "grad_norm": 0.4296875, + "learning_rate": 4.375641284486808e-05, + "loss": 0.8577, + "step": 8740 + }, + { + "epoch": 10.816326530612244, + "grad_norm": 0.4921875, + "learning_rate": 4.36374027515878e-05, + "loss": 0.8692, + "step": 8745 + }, + { + "epoch": 10.822510822510823, + "grad_norm": 0.451171875, + "learning_rate": 4.3518509538365425e-05, + "loss": 0.866, + "step": 8750 + }, + { + "epoch": 10.8286951144094, + "grad_norm": 0.462890625, + "learning_rate": 4.3399733451751776e-05, + "loss": 0.859, + "step": 8755 + }, + { + "epoch": 10.834879406307978, + "grad_norm": 0.458984375, + "learning_rate": 4.328107473805487e-05, + "loss": 0.8584, + "step": 8760 + }, + { + "epoch": 10.841063698206556, + "grad_norm": 0.478515625, + "learning_rate": 4.3162533643339185e-05, + "loss": 0.8585, + "step": 8765 + }, + { + "epoch": 10.847247990105133, + "grad_norm": 0.443359375, + "learning_rate": 4.3044110413425395e-05, + "loss": 0.867, + "step": 8770 + }, + { + "epoch": 10.85343228200371, + "grad_norm": 0.4375, + "learning_rate": 4.2925805293889786e-05, + "loss": 0.854, + "step": 8775 + }, + { + "epoch": 10.859616573902288, + "grad_norm": 0.42578125, + "learning_rate": 4.2807618530063565e-05, + "loss": 0.8525, + "step": 8780 + }, + { + "epoch": 10.865800865800866, + "grad_norm": 0.43359375, + "learning_rate": 4.268955036703267e-05, + "loss": 0.8495, + "step": 8785 + }, + { + "epoch": 10.871985157699443, + "grad_norm": 0.443359375, + "learning_rate": 4.257160104963696e-05, + "loss": 0.863, + "step": 8790 + }, + { + "epoch": 10.87816944959802, + "grad_norm": 0.4296875, + "learning_rate": 4.245377082246995e-05, + "loss": 0.8614, + "step": 8795 + }, + { + "epoch": 10.884353741496598, + "grad_norm": 0.4453125, + "learning_rate": 4.23360599298781e-05, + "loss": 0.8674, + "step": 8800 + }, + { + "epoch": 10.890538033395176, + "grad_norm": 0.435546875, + "learning_rate": 4.2218468615960484e-05, + "loss": 0.8588, + "step": 8805 + }, + { + "epoch": 10.896722325293753, + "grad_norm": 0.4609375, + "learning_rate": 4.210099712456822e-05, + "loss": 0.8665, + "step": 8810 + }, + { + "epoch": 10.90290661719233, + "grad_norm": 0.451171875, + "learning_rate": 4.1983645699303786e-05, + "loss": 0.8527, + "step": 8815 + }, + { + "epoch": 10.909090909090908, + "grad_norm": 0.431640625, + "learning_rate": 4.1866414583520877e-05, + "loss": 0.8574, + "step": 8820 + }, + { + "epoch": 10.915275200989488, + "grad_norm": 0.439453125, + "learning_rate": 4.174930402032354e-05, + "loss": 0.8598, + "step": 8825 + }, + { + "epoch": 10.921459492888065, + "grad_norm": 0.458984375, + "learning_rate": 4.163231425256595e-05, + "loss": 0.8656, + "step": 8830 + }, + { + "epoch": 10.927643784786643, + "grad_norm": 0.455078125, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.8586, + "step": 8835 + }, + { + "epoch": 10.93382807668522, + "grad_norm": 0.453125, + "learning_rate": 4.139869807353357e-05, + "loss": 0.8615, + "step": 8840 + }, + { + "epoch": 10.940012368583798, + "grad_norm": 0.447265625, + "learning_rate": 4.128207214671255e-05, + "loss": 0.8655, + "step": 8845 + }, + { + "epoch": 10.946196660482375, + "grad_norm": 0.435546875, + "learning_rate": 4.1165567984237764e-05, + "loss": 0.8549, + "step": 8850 + }, + { + "epoch": 10.952380952380953, + "grad_norm": 0.443359375, + "learning_rate": 4.1049185827705904e-05, + "loss": 0.859, + "step": 8855 + }, + { + "epoch": 10.95856524427953, + "grad_norm": 0.4375, + "learning_rate": 4.0932925918460516e-05, + "loss": 0.8618, + "step": 8860 + }, + { + "epoch": 10.964749536178108, + "grad_norm": 0.453125, + "learning_rate": 4.081678849759181e-05, + "loss": 0.8544, + "step": 8865 + }, + { + "epoch": 10.970933828076685, + "grad_norm": 0.42578125, + "learning_rate": 4.070077380593579e-05, + "loss": 0.8579, + "step": 8870 + }, + { + "epoch": 10.977118119975263, + "grad_norm": 0.439453125, + "learning_rate": 4.058488208407415e-05, + "loss": 0.8697, + "step": 8875 + }, + { + "epoch": 10.98330241187384, + "grad_norm": 0.423828125, + "learning_rate": 4.046911357233343e-05, + "loss": 0.8579, + "step": 8880 + }, + { + "epoch": 10.989486703772418, + "grad_norm": 0.439453125, + "learning_rate": 4.035346851078471e-05, + "loss": 0.8545, + "step": 8885 + }, + { + "epoch": 10.995670995670995, + "grad_norm": 0.42578125, + "learning_rate": 4.02379471392431e-05, + "loss": 0.8555, + "step": 8890 + }, + { + "epoch": 10.999381570810142, + "eval_loss": 2.519637107849121, + "eval_runtime": 0.6376, + "eval_samples_per_second": 15.684, + "eval_steps_per_second": 1.568, + "step": 8893 + }, + { + "epoch": 11.001855287569573, + "grad_norm": 0.47265625, + "learning_rate": 4.0122549697267244e-05, + "loss": 0.8561, + "step": 8895 + }, + { + "epoch": 11.00803957946815, + "grad_norm": 0.41796875, + "learning_rate": 4.000727642415867e-05, + "loss": 0.8568, + "step": 8900 + }, + { + "epoch": 11.01422387136673, + "grad_norm": 0.470703125, + "learning_rate": 3.9892127558961546e-05, + "loss": 0.8552, + "step": 8905 + }, + { + "epoch": 11.020408163265307, + "grad_norm": 0.44921875, + "learning_rate": 3.977710334046193e-05, + "loss": 0.8449, + "step": 8910 + }, + { + "epoch": 11.026592455163884, + "grad_norm": 0.453125, + "learning_rate": 3.9662204007187534e-05, + "loss": 0.8568, + "step": 8915 + }, + { + "epoch": 11.032776747062462, + "grad_norm": 0.44921875, + "learning_rate": 3.954742979740695e-05, + "loss": 0.8545, + "step": 8920 + }, + { + "epoch": 11.03896103896104, + "grad_norm": 0.427734375, + "learning_rate": 3.943278094912946e-05, + "loss": 0.8591, + "step": 8925 + }, + { + "epoch": 11.045145330859617, + "grad_norm": 0.443359375, + "learning_rate": 3.9318257700104174e-05, + "loss": 0.8521, + "step": 8930 + }, + { + "epoch": 11.051329622758194, + "grad_norm": 0.43359375, + "learning_rate": 3.920386028781995e-05, + "loss": 0.8529, + "step": 8935 + }, + { + "epoch": 11.057513914656772, + "grad_norm": 0.466796875, + "learning_rate": 3.9089588949504655e-05, + "loss": 0.8505, + "step": 8940 + }, + { + "epoch": 11.06369820655535, + "grad_norm": 0.443359375, + "learning_rate": 3.897544392212453e-05, + "loss": 0.8584, + "step": 8945 + }, + { + "epoch": 11.069882498453927, + "grad_norm": 0.443359375, + "learning_rate": 3.8861425442384135e-05, + "loss": 0.8614, + "step": 8950 + }, + { + "epoch": 11.076066790352504, + "grad_norm": 0.4375, + "learning_rate": 3.874753374672542e-05, + "loss": 0.8564, + "step": 8955 + }, + { + "epoch": 11.082251082251082, + "grad_norm": 0.4375, + "learning_rate": 3.863376907132752e-05, + "loss": 0.8547, + "step": 8960 + }, + { + "epoch": 11.08843537414966, + "grad_norm": 0.431640625, + "learning_rate": 3.8520131652106186e-05, + "loss": 0.8568, + "step": 8965 + }, + { + "epoch": 11.094619666048237, + "grad_norm": 0.431640625, + "learning_rate": 3.840662172471315e-05, + "loss": 0.8578, + "step": 8970 + }, + { + "epoch": 11.100803957946814, + "grad_norm": 0.43359375, + "learning_rate": 3.8293239524535915e-05, + "loss": 0.8555, + "step": 8975 + }, + { + "epoch": 11.106988249845394, + "grad_norm": 0.4453125, + "learning_rate": 3.8179985286696986e-05, + "loss": 0.8503, + "step": 8980 + }, + { + "epoch": 11.113172541743971, + "grad_norm": 0.439453125, + "learning_rate": 3.806685924605361e-05, + "loss": 0.8591, + "step": 8985 + }, + { + "epoch": 11.119356833642549, + "grad_norm": 0.439453125, + "learning_rate": 3.7953861637197085e-05, + "loss": 0.8569, + "step": 8990 + }, + { + "epoch": 11.125541125541126, + "grad_norm": 0.431640625, + "learning_rate": 3.784099269445247e-05, + "loss": 0.8432, + "step": 8995 + }, + { + "epoch": 11.131725417439704, + "grad_norm": 0.435546875, + "learning_rate": 3.772825265187802e-05, + "loss": 0.8495, + "step": 9000 + }, + { + "epoch": 11.137909709338281, + "grad_norm": 0.419921875, + "learning_rate": 3.7615641743264586e-05, + "loss": 0.8622, + "step": 9005 + }, + { + "epoch": 11.144094001236859, + "grad_norm": 0.4296875, + "learning_rate": 3.75031602021353e-05, + "loss": 0.8522, + "step": 9010 + }, + { + "epoch": 11.150278293135436, + "grad_norm": 0.40625, + "learning_rate": 3.739080826174498e-05, + "loss": 0.863, + "step": 9015 + }, + { + "epoch": 11.156462585034014, + "grad_norm": 0.423828125, + "learning_rate": 3.727858615507974e-05, + "loss": 0.8574, + "step": 9020 + }, + { + "epoch": 11.162646876932591, + "grad_norm": 0.42578125, + "learning_rate": 3.716649411485649e-05, + "loss": 0.8516, + "step": 9025 + }, + { + "epoch": 11.168831168831169, + "grad_norm": 0.4296875, + "learning_rate": 3.705453237352227e-05, + "loss": 0.8488, + "step": 9030 + }, + { + "epoch": 11.175015460729746, + "grad_norm": 0.4453125, + "learning_rate": 3.694270116325409e-05, + "loss": 0.8661, + "step": 9035 + }, + { + "epoch": 11.181199752628324, + "grad_norm": 0.439453125, + "learning_rate": 3.683100071595813e-05, + "loss": 0.8578, + "step": 9040 + }, + { + "epoch": 11.187384044526901, + "grad_norm": 0.416015625, + "learning_rate": 3.6719431263269533e-05, + "loss": 0.8548, + "step": 9045 + }, + { + "epoch": 11.193568336425479, + "grad_norm": 0.427734375, + "learning_rate": 3.660799303655166e-05, + "loss": 0.8551, + "step": 9050 + }, + { + "epoch": 11.199752628324056, + "grad_norm": 0.44140625, + "learning_rate": 3.6496686266895874e-05, + "loss": 0.8744, + "step": 9055 + }, + { + "epoch": 11.205936920222635, + "grad_norm": 0.421875, + "learning_rate": 3.638551118512089e-05, + "loss": 0.8612, + "step": 9060 + }, + { + "epoch": 11.212121212121213, + "grad_norm": 0.421875, + "learning_rate": 3.6274468021772323e-05, + "loss": 0.8442, + "step": 9065 + }, + { + "epoch": 11.21830550401979, + "grad_norm": 0.435546875, + "learning_rate": 3.616355700712221e-05, + "loss": 0.8621, + "step": 9070 + }, + { + "epoch": 11.224489795918368, + "grad_norm": 0.419921875, + "learning_rate": 3.605277837116854e-05, + "loss": 0.8561, + "step": 9075 + }, + { + "epoch": 11.230674087816945, + "grad_norm": 0.439453125, + "learning_rate": 3.594213234363486e-05, + "loss": 0.8508, + "step": 9080 + }, + { + "epoch": 11.236858379715523, + "grad_norm": 0.40625, + "learning_rate": 3.583161915396971e-05, + "loss": 0.8477, + "step": 9085 + }, + { + "epoch": 11.2430426716141, + "grad_norm": 0.421875, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.8554, + "step": 9090 + }, + { + "epoch": 11.249226963512678, + "grad_norm": 0.419921875, + "learning_rate": 3.561099220466111e-05, + "loss": 0.8456, + "step": 9095 + }, + { + "epoch": 11.255411255411255, + "grad_norm": 0.435546875, + "learning_rate": 3.550087890253544e-05, + "loss": 0.8625, + "step": 9100 + }, + { + "epoch": 11.261595547309833, + "grad_norm": 0.427734375, + "learning_rate": 3.539089935331294e-05, + "loss": 0.8504, + "step": 9105 + }, + { + "epoch": 11.26777983920841, + "grad_norm": 0.419921875, + "learning_rate": 3.5281053785059925e-05, + "loss": 0.8567, + "step": 9110 + }, + { + "epoch": 11.273964131106988, + "grad_norm": 0.439453125, + "learning_rate": 3.5171342425565055e-05, + "loss": 0.8728, + "step": 9115 + }, + { + "epoch": 11.280148423005565, + "grad_norm": 0.439453125, + "learning_rate": 3.506176550233863e-05, + "loss": 0.855, + "step": 9120 + }, + { + "epoch": 11.286332714904143, + "grad_norm": 0.443359375, + "learning_rate": 3.495232324261206e-05, + "loss": 0.8537, + "step": 9125 + }, + { + "epoch": 11.29251700680272, + "grad_norm": 0.42578125, + "learning_rate": 3.484301587333772e-05, + "loss": 0.8524, + "step": 9130 + }, + { + "epoch": 11.2987012987013, + "grad_norm": 0.44140625, + "learning_rate": 3.473384362118794e-05, + "loss": 0.8689, + "step": 9135 + }, + { + "epoch": 11.304885590599877, + "grad_norm": 0.435546875, + "learning_rate": 3.462480671255515e-05, + "loss": 0.8683, + "step": 9140 + }, + { + "epoch": 11.311069882498455, + "grad_norm": 0.447265625, + "learning_rate": 3.4515905373551016e-05, + "loss": 0.863, + "step": 9145 + }, + { + "epoch": 11.317254174397032, + "grad_norm": 0.443359375, + "learning_rate": 3.440713983000601e-05, + "loss": 0.8485, + "step": 9150 + }, + { + "epoch": 11.32343846629561, + "grad_norm": 0.41796875, + "learning_rate": 3.42985103074691e-05, + "loss": 0.8589, + "step": 9155 + }, + { + "epoch": 11.329622758194187, + "grad_norm": 0.43359375, + "learning_rate": 3.419001703120709e-05, + "loss": 0.8566, + "step": 9160 + }, + { + "epoch": 11.335807050092765, + "grad_norm": 0.42578125, + "learning_rate": 3.4081660226204357e-05, + "loss": 0.8617, + "step": 9165 + }, + { + "epoch": 11.341991341991342, + "grad_norm": 0.45703125, + "learning_rate": 3.397344011716216e-05, + "loss": 0.8579, + "step": 9170 + }, + { + "epoch": 11.34817563388992, + "grad_norm": 0.443359375, + "learning_rate": 3.386535692849838e-05, + "loss": 0.8511, + "step": 9175 + }, + { + "epoch": 11.354359925788497, + "grad_norm": 0.478515625, + "learning_rate": 3.3757410884346894e-05, + "loss": 0.8612, + "step": 9180 + }, + { + "epoch": 11.360544217687075, + "grad_norm": 0.43359375, + "learning_rate": 3.364960220855723e-05, + "loss": 0.8714, + "step": 9185 + }, + { + "epoch": 11.366728509585652, + "grad_norm": 0.447265625, + "learning_rate": 3.354193112469407e-05, + "loss": 0.8527, + "step": 9190 + }, + { + "epoch": 11.37291280148423, + "grad_norm": 0.43359375, + "learning_rate": 3.34343978560367e-05, + "loss": 0.859, + "step": 9195 + }, + { + "epoch": 11.379097093382807, + "grad_norm": 0.431640625, + "learning_rate": 3.332700262557864e-05, + "loss": 0.857, + "step": 9200 + }, + { + "epoch": 11.385281385281385, + "grad_norm": 0.416015625, + "learning_rate": 3.321974565602722e-05, + "loss": 0.861, + "step": 9205 + }, + { + "epoch": 11.391465677179962, + "grad_norm": 0.423828125, + "learning_rate": 3.3112627169802946e-05, + "loss": 0.8597, + "step": 9210 + }, + { + "epoch": 11.397649969078541, + "grad_norm": 0.435546875, + "learning_rate": 3.300564738903926e-05, + "loss": 0.8571, + "step": 9215 + }, + { + "epoch": 11.403834260977119, + "grad_norm": 0.44140625, + "learning_rate": 3.289880653558188e-05, + "loss": 0.8648, + "step": 9220 + }, + { + "epoch": 11.410018552875696, + "grad_norm": 0.431640625, + "learning_rate": 3.2792104830988515e-05, + "loss": 0.8598, + "step": 9225 + }, + { + "epoch": 11.416202844774274, + "grad_norm": 0.419921875, + "learning_rate": 3.2685542496528185e-05, + "loss": 0.8615, + "step": 9230 + }, + { + "epoch": 11.422387136672851, + "grad_norm": 0.44140625, + "learning_rate": 3.257911975318109e-05, + "loss": 0.8524, + "step": 9235 + }, + { + "epoch": 11.428571428571429, + "grad_norm": 0.466796875, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.8651, + "step": 9240 + }, + { + "epoch": 11.434755720470006, + "grad_norm": 0.423828125, + "learning_rate": 3.236669392229888e-05, + "loss": 0.8592, + "step": 9245 + }, + { + "epoch": 11.440940012368584, + "grad_norm": 0.423828125, + "learning_rate": 3.2260691275274835e-05, + "loss": 0.8623, + "step": 9250 + }, + { + "epoch": 11.447124304267161, + "grad_norm": 0.4453125, + "learning_rate": 3.2154829100385e-05, + "loss": 0.8588, + "step": 9255 + }, + { + "epoch": 11.453308596165739, + "grad_norm": 0.443359375, + "learning_rate": 3.204910761715763e-05, + "loss": 0.8525, + "step": 9260 + }, + { + "epoch": 11.459492888064316, + "grad_norm": 0.423828125, + "learning_rate": 3.194352704482899e-05, + "loss": 0.8494, + "step": 9265 + }, + { + "epoch": 11.465677179962894, + "grad_norm": 0.42578125, + "learning_rate": 3.1838087602343344e-05, + "loss": 0.8531, + "step": 9270 + }, + { + "epoch": 11.471861471861471, + "grad_norm": 0.431640625, + "learning_rate": 3.173278950835227e-05, + "loss": 0.8532, + "step": 9275 + }, + { + "epoch": 11.478045763760049, + "grad_norm": 0.443359375, + "learning_rate": 3.162763298121408e-05, + "loss": 0.8603, + "step": 9280 + }, + { + "epoch": 11.484230055658626, + "grad_norm": 0.423828125, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.8561, + "step": 9285 + }, + { + "epoch": 11.490414347557206, + "grad_norm": 0.42578125, + "learning_rate": 3.1417745499461934e-05, + "loss": 0.8618, + "step": 9290 + }, + { + "epoch": 11.496598639455783, + "grad_norm": 0.44921875, + "learning_rate": 3.131301498009514e-05, + "loss": 0.8649, + "step": 9295 + }, + { + "epoch": 11.50278293135436, + "grad_norm": 0.43359375, + "learning_rate": 3.120842689807468e-05, + "loss": 0.8618, + "step": 9300 + }, + { + "epoch": 11.508967223252938, + "grad_norm": 0.4296875, + "learning_rate": 3.110398147028666e-05, + "loss": 0.8535, + "step": 9305 + }, + { + "epoch": 11.515151515151516, + "grad_norm": 0.427734375, + "learning_rate": 3.099967891332132e-05, + "loss": 0.8454, + "step": 9310 + }, + { + "epoch": 11.521335807050093, + "grad_norm": 0.43359375, + "learning_rate": 3.089551944347255e-05, + "loss": 0.8551, + "step": 9315 + }, + { + "epoch": 11.52752009894867, + "grad_norm": 0.447265625, + "learning_rate": 3.079150327673766e-05, + "loss": 0.8603, + "step": 9320 + }, + { + "epoch": 11.533704390847248, + "grad_norm": 0.435546875, + "learning_rate": 3.0687630628816656e-05, + "loss": 0.8571, + "step": 9325 + }, + { + "epoch": 11.539888682745826, + "grad_norm": 0.431640625, + "learning_rate": 3.058390171511196e-05, + "loss": 0.8548, + "step": 9330 + }, + { + "epoch": 11.546072974644403, + "grad_norm": 0.4140625, + "learning_rate": 3.0480316750728012e-05, + "loss": 0.8556, + "step": 9335 + }, + { + "epoch": 11.55225726654298, + "grad_norm": 0.421875, + "learning_rate": 3.0376875950470617e-05, + "loss": 0.8581, + "step": 9340 + }, + { + "epoch": 11.558441558441558, + "grad_norm": 0.42578125, + "learning_rate": 3.0273579528846762e-05, + "loss": 0.8545, + "step": 9345 + }, + { + "epoch": 11.564625850340136, + "grad_norm": 0.431640625, + "learning_rate": 3.0170427700063873e-05, + "loss": 0.8601, + "step": 9350 + }, + { + "epoch": 11.570810142238713, + "grad_norm": 0.4296875, + "learning_rate": 3.0067420678029702e-05, + "loss": 0.8641, + "step": 9355 + }, + { + "epoch": 11.57699443413729, + "grad_norm": 0.41796875, + "learning_rate": 2.996455867635155e-05, + "loss": 0.8537, + "step": 9360 + }, + { + "epoch": 11.583178726035868, + "grad_norm": 0.419921875, + "learning_rate": 2.9861841908336098e-05, + "loss": 0.8596, + "step": 9365 + }, + { + "epoch": 11.589363017934446, + "grad_norm": 0.416015625, + "learning_rate": 2.9759270586988865e-05, + "loss": 0.8543, + "step": 9370 + }, + { + "epoch": 11.595547309833025, + "grad_norm": 0.43359375, + "learning_rate": 2.9656844925013637e-05, + "loss": 0.8673, + "step": 9375 + }, + { + "epoch": 11.601731601731602, + "grad_norm": 0.447265625, + "learning_rate": 2.9554565134812294e-05, + "loss": 0.8609, + "step": 9380 + }, + { + "epoch": 11.60791589363018, + "grad_norm": 0.431640625, + "learning_rate": 2.9452431428484062e-05, + "loss": 0.853, + "step": 9385 + }, + { + "epoch": 11.614100185528757, + "grad_norm": 0.431640625, + "learning_rate": 2.9350444017825385e-05, + "loss": 0.8552, + "step": 9390 + }, + { + "epoch": 11.620284477427335, + "grad_norm": 0.431640625, + "learning_rate": 2.9248603114329222e-05, + "loss": 0.8633, + "step": 9395 + }, + { + "epoch": 11.626468769325912, + "grad_norm": 0.439453125, + "learning_rate": 2.9146908929184713e-05, + "loss": 0.8658, + "step": 9400 + }, + { + "epoch": 11.63265306122449, + "grad_norm": 0.427734375, + "learning_rate": 2.9045361673276872e-05, + "loss": 0.8517, + "step": 9405 + }, + { + "epoch": 11.638837353123067, + "grad_norm": 0.4296875, + "learning_rate": 2.894396155718585e-05, + "loss": 0.8599, + "step": 9410 + }, + { + "epoch": 11.645021645021645, + "grad_norm": 0.435546875, + "learning_rate": 2.8842708791186835e-05, + "loss": 0.8648, + "step": 9415 + }, + { + "epoch": 11.651205936920222, + "grad_norm": 0.4140625, + "learning_rate": 2.874160358524931e-05, + "loss": 0.8556, + "step": 9420 + }, + { + "epoch": 11.6573902288188, + "grad_norm": 0.431640625, + "learning_rate": 2.8640646149036898e-05, + "loss": 0.8598, + "step": 9425 + }, + { + "epoch": 11.663574520717377, + "grad_norm": 0.46875, + "learning_rate": 2.853983669190664e-05, + "loss": 0.8665, + "step": 9430 + }, + { + "epoch": 11.669758812615955, + "grad_norm": 0.421875, + "learning_rate": 2.8439175422908824e-05, + "loss": 0.8515, + "step": 9435 + }, + { + "epoch": 11.675943104514532, + "grad_norm": 0.42578125, + "learning_rate": 2.8338662550786443e-05, + "loss": 0.8591, + "step": 9440 + }, + { + "epoch": 11.682127396413112, + "grad_norm": 0.416015625, + "learning_rate": 2.823829828397465e-05, + "loss": 0.8331, + "step": 9445 + }, + { + "epoch": 11.688311688311689, + "grad_norm": 0.44921875, + "learning_rate": 2.8138082830600554e-05, + "loss": 0.8499, + "step": 9450 + }, + { + "epoch": 11.694495980210267, + "grad_norm": 0.443359375, + "learning_rate": 2.8038016398482593e-05, + "loss": 0.857, + "step": 9455 + }, + { + "epoch": 11.700680272108844, + "grad_norm": 0.431640625, + "learning_rate": 2.7938099195130153e-05, + "loss": 0.8609, + "step": 9460 + }, + { + "epoch": 11.706864564007422, + "grad_norm": 0.44140625, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.853, + "step": 9465 + }, + { + "epoch": 11.713048855905999, + "grad_norm": 0.435546875, + "learning_rate": 2.7738713303211982e-05, + "loss": 0.8501, + "step": 9470 + }, + { + "epoch": 11.719233147804577, + "grad_norm": 0.4375, + "learning_rate": 2.763924502811609e-05, + "loss": 0.8579, + "step": 9475 + }, + { + "epoch": 11.725417439703154, + "grad_norm": 0.4375, + "learning_rate": 2.753992680872457e-05, + "loss": 0.8647, + "step": 9480 + }, + { + "epoch": 11.731601731601732, + "grad_norm": 0.427734375, + "learning_rate": 2.7440758850995318e-05, + "loss": 0.8558, + "step": 9485 + }, + { + "epoch": 11.737786023500309, + "grad_norm": 0.423828125, + "learning_rate": 2.7341741360574548e-05, + "loss": 0.8538, + "step": 9490 + }, + { + "epoch": 11.743970315398887, + "grad_norm": 0.43359375, + "learning_rate": 2.7242874542796482e-05, + "loss": 0.8588, + "step": 9495 + }, + { + "epoch": 11.750154607297464, + "grad_norm": 0.451171875, + "learning_rate": 2.7144158602682924e-05, + "loss": 0.8642, + "step": 9500 + }, + { + "epoch": 11.756338899196042, + "grad_norm": 0.451171875, + "learning_rate": 2.704559374494272e-05, + "loss": 0.8658, + "step": 9505 + }, + { + "epoch": 11.762523191094619, + "grad_norm": 0.43359375, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.8672, + "step": 9510 + }, + { + "epoch": 11.768707482993197, + "grad_norm": 0.41015625, + "learning_rate": 2.6848918093851104e-05, + "loss": 0.8565, + "step": 9515 + }, + { + "epoch": 11.774891774891774, + "grad_norm": 0.4140625, + "learning_rate": 2.6750807708349267e-05, + "loss": 0.8646, + "step": 9520 + }, + { + "epoch": 11.781076066790352, + "grad_norm": 0.423828125, + "learning_rate": 2.665284922091912e-05, + "loss": 0.8566, + "step": 9525 + }, + { + "epoch": 11.78726035868893, + "grad_norm": 0.431640625, + "learning_rate": 2.6555042834698773e-05, + "loss": 0.853, + "step": 9530 + }, + { + "epoch": 11.793444650587508, + "grad_norm": 0.42578125, + "learning_rate": 2.6457388752511025e-05, + "loss": 0.8578, + "step": 9535 + }, + { + "epoch": 11.799628942486086, + "grad_norm": 0.435546875, + "learning_rate": 2.6359887176862718e-05, + "loss": 0.8506, + "step": 9540 + }, + { + "epoch": 11.805813234384663, + "grad_norm": 0.431640625, + "learning_rate": 2.626253830994455e-05, + "loss": 0.8466, + "step": 9545 + }, + { + "epoch": 11.81199752628324, + "grad_norm": 0.455078125, + "learning_rate": 2.6165342353630428e-05, + "loss": 0.8668, + "step": 9550 + }, + { + "epoch": 11.818181818181818, + "grad_norm": 0.439453125, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.8625, + "step": 9555 + }, + { + "epoch": 11.824366110080396, + "grad_norm": 0.43359375, + "learning_rate": 2.5971409978724458e-05, + "loss": 0.8743, + "step": 9560 + }, + { + "epoch": 11.830550401978973, + "grad_norm": 0.416015625, + "learning_rate": 2.5874673962293373e-05, + "loss": 0.8559, + "step": 9565 + }, + { + "epoch": 11.83673469387755, + "grad_norm": 0.419921875, + "learning_rate": 2.577809166078716e-05, + "loss": 0.8542, + "step": 9570 + }, + { + "epoch": 11.842918985776128, + "grad_norm": 0.419921875, + "learning_rate": 2.5681663274490107e-05, + "loss": 0.8476, + "step": 9575 + }, + { + "epoch": 11.849103277674706, + "grad_norm": 0.421875, + "learning_rate": 2.558538900336741e-05, + "loss": 0.8544, + "step": 9580 + }, + { + "epoch": 11.855287569573283, + "grad_norm": 0.416015625, + "learning_rate": 2.548926904706459e-05, + "loss": 0.8608, + "step": 9585 + }, + { + "epoch": 11.86147186147186, + "grad_norm": 0.46484375, + "learning_rate": 2.5393303604907205e-05, + "loss": 0.8591, + "step": 9590 + }, + { + "epoch": 11.867656153370438, + "grad_norm": 0.435546875, + "learning_rate": 2.529749287590042e-05, + "loss": 0.8516, + "step": 9595 + }, + { + "epoch": 11.873840445269018, + "grad_norm": 0.423828125, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.8549, + "step": 9600 + }, + { + "epoch": 11.880024737167595, + "grad_norm": 0.443359375, + "learning_rate": 2.5106336351754578e-05, + "loss": 0.8498, + "step": 9605 + }, + { + "epoch": 11.886209029066173, + "grad_norm": 0.421875, + "learning_rate": 2.5010990953019975e-05, + "loss": 0.8661, + "step": 9610 + }, + { + "epoch": 11.89239332096475, + "grad_norm": 0.455078125, + "learning_rate": 2.4915801060244092e-05, + "loss": 0.8668, + "step": 9615 + }, + { + "epoch": 11.898577612863328, + "grad_norm": 0.392578125, + "learning_rate": 2.4820766870823807e-05, + "loss": 0.8547, + "step": 9620 + }, + { + "epoch": 11.904761904761905, + "grad_norm": 0.439453125, + "learning_rate": 2.4725888581833058e-05, + "loss": 0.852, + "step": 9625 + }, + { + "epoch": 11.910946196660483, + "grad_norm": 0.431640625, + "learning_rate": 2.4631166390022574e-05, + "loss": 0.8616, + "step": 9630 + }, + { + "epoch": 11.91713048855906, + "grad_norm": 0.43359375, + "learning_rate": 2.4536600491819318e-05, + "loss": 0.8677, + "step": 9635 + }, + { + "epoch": 11.923314780457638, + "grad_norm": 0.408203125, + "learning_rate": 2.4442191083326195e-05, + "loss": 0.8516, + "step": 9640 + }, + { + "epoch": 11.929499072356215, + "grad_norm": 0.42578125, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.8636, + "step": 9645 + }, + { + "epoch": 11.935683364254793, + "grad_norm": 0.4296875, + "learning_rate": 2.425384251825882e-05, + "loss": 0.8504, + "step": 9650 + }, + { + "epoch": 11.94186765615337, + "grad_norm": 0.447265625, + "learning_rate": 2.4159903752266156e-05, + "loss": 0.8598, + "step": 9655 + }, + { + "epoch": 11.948051948051948, + "grad_norm": 0.4375, + "learning_rate": 2.4066122257145894e-05, + "loss": 0.8374, + "step": 9660 + }, + { + "epoch": 11.954236239950525, + "grad_norm": 0.423828125, + "learning_rate": 2.3972498227374342e-05, + "loss": 0.8554, + "step": 9665 + }, + { + "epoch": 11.960420531849103, + "grad_norm": 0.419921875, + "learning_rate": 2.387903185710115e-05, + "loss": 0.8525, + "step": 9670 + }, + { + "epoch": 11.96660482374768, + "grad_norm": 0.462890625, + "learning_rate": 2.3785723340149134e-05, + "loss": 0.8618, + "step": 9675 + }, + { + "epoch": 11.972789115646258, + "grad_norm": 0.43359375, + "learning_rate": 2.3692572870013718e-05, + "loss": 0.8645, + "step": 9680 + }, + { + "epoch": 11.978973407544837, + "grad_norm": 0.435546875, + "learning_rate": 2.359958063986256e-05, + "loss": 0.8472, + "step": 9685 + }, + { + "epoch": 11.985157699443414, + "grad_norm": 0.41796875, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.8537, + "step": 9690 + }, + { + "epoch": 11.991341991341992, + "grad_norm": 0.431640625, + "learning_rate": 2.3414071670542703e-05, + "loss": 0.8494, + "step": 9695 + }, + { + "epoch": 11.99752628324057, + "grad_norm": 0.435546875, + "learning_rate": 2.3321555316067045e-05, + "loss": 0.8442, + "step": 9700 + }, + { + "epoch": 12.0, + "eval_loss": 2.5192575454711914, + "eval_runtime": 0.5399, + "eval_samples_per_second": 18.52, + "eval_steps_per_second": 1.852, + "step": 9702 + }, + { + "epoch": 12.003710575139147, + "grad_norm": 0.42578125, + "learning_rate": 2.3229197970960924e-05, + "loss": 0.8504, + "step": 9705 + }, + { + "epoch": 12.009894867037724, + "grad_norm": 0.40625, + "learning_rate": 2.313699982674736e-05, + "loss": 0.8496, + "step": 9710 + }, + { + "epoch": 12.016079158936302, + "grad_norm": 0.447265625, + "learning_rate": 2.3044961074619165e-05, + "loss": 0.8635, + "step": 9715 + }, + { + "epoch": 12.02226345083488, + "grad_norm": 0.46484375, + "learning_rate": 2.295308190543859e-05, + "loss": 0.8556, + "step": 9720 + }, + { + "epoch": 12.028447742733457, + "grad_norm": 0.43359375, + "learning_rate": 2.2861362509737072e-05, + "loss": 0.8588, + "step": 9725 + }, + { + "epoch": 12.034632034632034, + "grad_norm": 0.412109375, + "learning_rate": 2.276980307771458e-05, + "loss": 0.858, + "step": 9730 + }, + { + "epoch": 12.040816326530612, + "grad_norm": 0.416015625, + "learning_rate": 2.26784037992395e-05, + "loss": 0.848, + "step": 9735 + }, + { + "epoch": 12.04700061842919, + "grad_norm": 0.43359375, + "learning_rate": 2.2587164863847975e-05, + "loss": 0.8624, + "step": 9740 + }, + { + "epoch": 12.053184910327767, + "grad_norm": 0.4375, + "learning_rate": 2.249608646074375e-05, + "loss": 0.8623, + "step": 9745 + }, + { + "epoch": 12.059369202226344, + "grad_norm": 0.431640625, + "learning_rate": 2.2405168778797646e-05, + "loss": 0.8504, + "step": 9750 + }, + { + "epoch": 12.065553494124922, + "grad_norm": 0.478515625, + "learning_rate": 2.2314412006547125e-05, + "loss": 0.8403, + "step": 9755 + }, + { + "epoch": 12.071737786023501, + "grad_norm": 0.4453125, + "learning_rate": 2.222381633219608e-05, + "loss": 0.859, + "step": 9760 + }, + { + "epoch": 12.077922077922079, + "grad_norm": 0.416015625, + "learning_rate": 2.2133381943614207e-05, + "loss": 0.8639, + "step": 9765 + }, + { + "epoch": 12.084106369820656, + "grad_norm": 0.447265625, + "learning_rate": 2.204310902833685e-05, + "loss": 0.859, + "step": 9770 + }, + { + "epoch": 12.090290661719234, + "grad_norm": 0.451171875, + "learning_rate": 2.1952997773564467e-05, + "loss": 0.8598, + "step": 9775 + }, + { + "epoch": 12.096474953617811, + "grad_norm": 0.423828125, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.8472, + "step": 9780 + }, + { + "epoch": 12.102659245516389, + "grad_norm": 0.435546875, + "learning_rate": 2.1773260992659728e-05, + "loss": 0.8557, + "step": 9785 + }, + { + "epoch": 12.108843537414966, + "grad_norm": 0.443359375, + "learning_rate": 2.1683635839250537e-05, + "loss": 0.8597, + "step": 9790 + }, + { + "epoch": 12.115027829313544, + "grad_norm": 0.41796875, + "learning_rate": 2.159417309179189e-05, + "loss": 0.8594, + "step": 9795 + }, + { + "epoch": 12.121212121212121, + "grad_norm": 0.43359375, + "learning_rate": 2.15048729358041e-05, + "loss": 0.859, + "step": 9800 + }, + { + "epoch": 12.127396413110699, + "grad_norm": 0.42578125, + "learning_rate": 2.141573555647042e-05, + "loss": 0.8495, + "step": 9805 + }, + { + "epoch": 12.133580705009276, + "grad_norm": 0.43359375, + "learning_rate": 2.1326761138636553e-05, + "loss": 0.8567, + "step": 9810 + }, + { + "epoch": 12.139764996907854, + "grad_norm": 0.46875, + "learning_rate": 2.123794986681017e-05, + "loss": 0.8475, + "step": 9815 + }, + { + "epoch": 12.145949288806431, + "grad_norm": 0.447265625, + "learning_rate": 2.114930192516076e-05, + "loss": 0.8616, + "step": 9820 + }, + { + "epoch": 12.152133580705009, + "grad_norm": 0.421875, + "learning_rate": 2.106081749751897e-05, + "loss": 0.8602, + "step": 9825 + }, + { + "epoch": 12.158317872603586, + "grad_norm": 0.427734375, + "learning_rate": 2.097249676737648e-05, + "loss": 0.8583, + "step": 9830 + }, + { + "epoch": 12.164502164502165, + "grad_norm": 0.431640625, + "learning_rate": 2.0884339917885433e-05, + "loss": 0.8533, + "step": 9835 + }, + { + "epoch": 12.170686456400743, + "grad_norm": 0.427734375, + "learning_rate": 2.0796347131858186e-05, + "loss": 0.8532, + "step": 9840 + }, + { + "epoch": 12.17687074829932, + "grad_norm": 0.42578125, + "learning_rate": 2.0708518591766825e-05, + "loss": 0.8622, + "step": 9845 + }, + { + "epoch": 12.183055040197898, + "grad_norm": 0.41015625, + "learning_rate": 2.0620854479742834e-05, + "loss": 0.8548, + "step": 9850 + }, + { + "epoch": 12.189239332096475, + "grad_norm": 0.427734375, + "learning_rate": 2.05333549775768e-05, + "loss": 0.8512, + "step": 9855 + }, + { + "epoch": 12.195423623995053, + "grad_norm": 0.4296875, + "learning_rate": 2.044602026671786e-05, + "loss": 0.8632, + "step": 9860 + }, + { + "epoch": 12.20160791589363, + "grad_norm": 0.427734375, + "learning_rate": 2.0358850528273455e-05, + "loss": 0.856, + "step": 9865 + }, + { + "epoch": 12.207792207792208, + "grad_norm": 0.419921875, + "learning_rate": 2.027184594300898e-05, + "loss": 0.856, + "step": 9870 + }, + { + "epoch": 12.213976499690785, + "grad_norm": 0.4375, + "learning_rate": 2.018500669134723e-05, + "loss": 0.8541, + "step": 9875 + }, + { + "epoch": 12.220160791589363, + "grad_norm": 0.42578125, + "learning_rate": 2.0098332953368272e-05, + "loss": 0.86, + "step": 9880 + }, + { + "epoch": 12.22634508348794, + "grad_norm": 0.421875, + "learning_rate": 2.0011824908808808e-05, + "loss": 0.8557, + "step": 9885 + }, + { + "epoch": 12.232529375386518, + "grad_norm": 0.4140625, + "learning_rate": 1.9925482737062085e-05, + "loss": 0.8758, + "step": 9890 + }, + { + "epoch": 12.238713667285095, + "grad_norm": 0.412109375, + "learning_rate": 1.9839306617177243e-05, + "loss": 0.8628, + "step": 9895 + }, + { + "epoch": 12.244897959183673, + "grad_norm": 0.40625, + "learning_rate": 1.9753296727859195e-05, + "loss": 0.854, + "step": 9900 + }, + { + "epoch": 12.25108225108225, + "grad_norm": 0.46484375, + "learning_rate": 1.966745324746806e-05, + "loss": 0.8571, + "step": 9905 + }, + { + "epoch": 12.257266542980828, + "grad_norm": 0.431640625, + "learning_rate": 1.9581776354018854e-05, + "loss": 0.8629, + "step": 9910 + }, + { + "epoch": 12.263450834879407, + "grad_norm": 0.43359375, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.8587, + "step": 9915 + }, + { + "epoch": 12.269635126777985, + "grad_norm": 0.427734375, + "learning_rate": 1.941092303827896e-05, + "loss": 0.8526, + "step": 9920 + }, + { + "epoch": 12.275819418676562, + "grad_norm": 0.435546875, + "learning_rate": 1.9325746970289627e-05, + "loss": 0.8631, + "step": 9925 + }, + { + "epoch": 12.28200371057514, + "grad_norm": 0.41796875, + "learning_rate": 1.9240738197844278e-05, + "loss": 0.8637, + "step": 9930 + }, + { + "epoch": 12.288188002473717, + "grad_norm": 0.421875, + "learning_rate": 1.9155896897226988e-05, + "loss": 0.8544, + "step": 9935 + }, + { + "epoch": 12.294372294372295, + "grad_norm": 0.453125, + "learning_rate": 1.9071223244374614e-05, + "loss": 0.8676, + "step": 9940 + }, + { + "epoch": 12.300556586270872, + "grad_norm": 0.431640625, + "learning_rate": 1.89867174148763e-05, + "loss": 0.8686, + "step": 9945 + }, + { + "epoch": 12.30674087816945, + "grad_norm": 0.416015625, + "learning_rate": 1.8902379583973208e-05, + "loss": 0.8476, + "step": 9950 + }, + { + "epoch": 12.312925170068027, + "grad_norm": 0.4375, + "learning_rate": 1.8818209926558082e-05, + "loss": 0.8516, + "step": 9955 + }, + { + "epoch": 12.319109461966605, + "grad_norm": 0.43359375, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.8567, + "step": 9960 + }, + { + "epoch": 12.325293753865182, + "grad_norm": 0.42578125, + "learning_rate": 1.8650375830018785e-05, + "loss": 0.8539, + "step": 9965 + }, + { + "epoch": 12.33147804576376, + "grad_norm": 0.42578125, + "learning_rate": 1.856671173893497e-05, + "loss": 0.8477, + "step": 9970 + }, + { + "epoch": 12.337662337662337, + "grad_norm": 0.431640625, + "learning_rate": 1.8483216517419142e-05, + "loss": 0.8546, + "step": 9975 + }, + { + "epoch": 12.343846629560915, + "grad_norm": 0.421875, + "learning_rate": 1.839989033861673e-05, + "loss": 0.8508, + "step": 9980 + }, + { + "epoch": 12.350030921459492, + "grad_norm": 0.42578125, + "learning_rate": 1.8316733375322637e-05, + "loss": 0.8376, + "step": 9985 + }, + { + "epoch": 12.35621521335807, + "grad_norm": 0.443359375, + "learning_rate": 1.8233745799980817e-05, + "loss": 0.8456, + "step": 9990 + }, + { + "epoch": 12.362399505256649, + "grad_norm": 0.4375, + "learning_rate": 1.8150927784684e-05, + "loss": 0.8578, + "step": 9995 + }, + { + "epoch": 12.368583797155226, + "grad_norm": 0.447265625, + "learning_rate": 1.8068279501173335e-05, + "loss": 0.8569, + "step": 10000 + }, + { + "epoch": 12.374768089053804, + "grad_norm": 0.4375, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.8492, + "step": 10005 + }, + { + "epoch": 12.380952380952381, + "grad_norm": 0.447265625, + "learning_rate": 1.790349281471445e-05, + "loss": 0.8583, + "step": 10010 + }, + { + "epoch": 12.387136672850959, + "grad_norm": 0.43359375, + "learning_rate": 1.782135475348714e-05, + "loss": 0.8493, + "step": 10015 + }, + { + "epoch": 12.393320964749536, + "grad_norm": 0.4453125, + "learning_rate": 1.773938710748706e-05, + "loss": 0.8574, + "step": 10020 + }, + { + "epoch": 12.399505256648114, + "grad_norm": 0.43359375, + "learning_rate": 1.765759004669183e-05, + "loss": 0.8448, + "step": 10025 + }, + { + "epoch": 12.405689548546691, + "grad_norm": 0.41796875, + "learning_rate": 1.757596374072543e-05, + "loss": 0.8523, + "step": 10030 + }, + { + "epoch": 12.411873840445269, + "grad_norm": 0.443359375, + "learning_rate": 1.7494508358857677e-05, + "loss": 0.8663, + "step": 10035 + }, + { + "epoch": 12.418058132343846, + "grad_norm": 0.41796875, + "learning_rate": 1.741322407000391e-05, + "loss": 0.8499, + "step": 10040 + }, + { + "epoch": 12.424242424242424, + "grad_norm": 0.416015625, + "learning_rate": 1.7332111042724775e-05, + "loss": 0.8595, + "step": 10045 + }, + { + "epoch": 12.430426716141001, + "grad_norm": 0.427734375, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.8644, + "step": 10050 + }, + { + "epoch": 12.436611008039579, + "grad_norm": 0.408203125, + "learning_rate": 1.7170399445356532e-05, + "loss": 0.8636, + "step": 10055 + }, + { + "epoch": 12.442795299938156, + "grad_norm": 0.42578125, + "learning_rate": 1.70898012106115e-05, + "loss": 0.8547, + "step": 10060 + }, + { + "epoch": 12.448979591836734, + "grad_norm": 0.4140625, + "learning_rate": 1.700937490812844e-05, + "loss": 0.8547, + "step": 10065 + }, + { + "epoch": 12.455163883735313, + "grad_norm": 0.41796875, + "learning_rate": 1.692912070468874e-05, + "loss": 0.8514, + "step": 10070 + }, + { + "epoch": 12.46134817563389, + "grad_norm": 0.431640625, + "learning_rate": 1.684903876671685e-05, + "loss": 0.849, + "step": 10075 + }, + { + "epoch": 12.467532467532468, + "grad_norm": 0.44140625, + "learning_rate": 1.676912926028007e-05, + "loss": 0.8497, + "step": 10080 + }, + { + "epoch": 12.473716759431046, + "grad_norm": 0.443359375, + "learning_rate": 1.668939235108802e-05, + "loss": 0.8668, + "step": 10085 + }, + { + "epoch": 12.479901051329623, + "grad_norm": 0.4140625, + "learning_rate": 1.660982820449247e-05, + "loss": 0.8532, + "step": 10090 + }, + { + "epoch": 12.4860853432282, + "grad_norm": 0.419921875, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.8464, + "step": 10095 + }, + { + "epoch": 12.492269635126778, + "grad_norm": 0.427734375, + "learning_rate": 1.6451218858706374e-05, + "loss": 0.8589, + "step": 10100 + }, + { + "epoch": 12.498453927025356, + "grad_norm": 0.412109375, + "learning_rate": 1.637217398842663e-05, + "loss": 0.8584, + "step": 10105 + }, + { + "epoch": 12.504638218923933, + "grad_norm": 0.435546875, + "learning_rate": 1.6293302538564382e-05, + "loss": 0.8587, + "step": 10110 + }, + { + "epoch": 12.51082251082251, + "grad_norm": 0.40234375, + "learning_rate": 1.6214604672676704e-05, + "loss": 0.8626, + "step": 10115 + }, + { + "epoch": 12.517006802721088, + "grad_norm": 0.421875, + "learning_rate": 1.6136080553960687e-05, + "loss": 0.8634, + "step": 10120 + }, + { + "epoch": 12.523191094619666, + "grad_norm": 0.44140625, + "learning_rate": 1.6057730345253065e-05, + "loss": 0.8508, + "step": 10125 + }, + { + "epoch": 12.529375386518243, + "grad_norm": 0.4453125, + "learning_rate": 1.5979554209030024e-05, + "loss": 0.8541, + "step": 10130 + }, + { + "epoch": 12.53555967841682, + "grad_norm": 0.423828125, + "learning_rate": 1.5901552307406653e-05, + "loss": 0.8523, + "step": 10135 + }, + { + "epoch": 12.541743970315398, + "grad_norm": 0.416015625, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.8674, + "step": 10140 + }, + { + "epoch": 12.547928262213976, + "grad_norm": 0.453125, + "learning_rate": 1.5746071854612797e-05, + "loss": 0.859, + "step": 10145 + }, + { + "epoch": 12.554112554112555, + "grad_norm": 0.4375, + "learning_rate": 1.5668593625864715e-05, + "loss": 0.8582, + "step": 10150 + }, + { + "epoch": 12.560296846011132, + "grad_norm": 0.427734375, + "learning_rate": 1.5591290276560466e-05, + "loss": 0.857, + "step": 10155 + }, + { + "epoch": 12.56648113790971, + "grad_norm": 0.408203125, + "learning_rate": 1.5514161967005337e-05, + "loss": 0.8556, + "step": 10160 + }, + { + "epoch": 12.572665429808287, + "grad_norm": 0.443359375, + "learning_rate": 1.543720885714157e-05, + "loss": 0.854, + "step": 10165 + }, + { + "epoch": 12.578849721706865, + "grad_norm": 0.435546875, + "learning_rate": 1.536043110654809e-05, + "loss": 0.8486, + "step": 10170 + }, + { + "epoch": 12.585034013605442, + "grad_norm": 0.4375, + "learning_rate": 1.528382887444022e-05, + "loss": 0.8531, + "step": 10175 + }, + { + "epoch": 12.59121830550402, + "grad_norm": 0.419921875, + "learning_rate": 1.5207402319669306e-05, + "loss": 0.8527, + "step": 10180 + }, + { + "epoch": 12.597402597402597, + "grad_norm": 0.423828125, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.8534, + "step": 10185 + }, + { + "epoch": 12.603586889301175, + "grad_norm": 0.439453125, + "learning_rate": 1.505507687572173e-05, + "loss": 0.8519, + "step": 10190 + }, + { + "epoch": 12.609771181199752, + "grad_norm": 0.421875, + "learning_rate": 1.4979178302424867e-05, + "loss": 0.8637, + "step": 10195 + }, + { + "epoch": 12.61595547309833, + "grad_norm": 0.43359375, + "learning_rate": 1.4903456038223939e-05, + "loss": 0.8663, + "step": 10200 + }, + { + "epoch": 12.622139764996907, + "grad_norm": 0.4296875, + "learning_rate": 1.4827910240145426e-05, + "loss": 0.858, + "step": 10205 + }, + { + "epoch": 12.628324056895485, + "grad_norm": 0.4296875, + "learning_rate": 1.4752541064849946e-05, + "loss": 0.8537, + "step": 10210 + }, + { + "epoch": 12.634508348794062, + "grad_norm": 0.412109375, + "learning_rate": 1.4677348668631763e-05, + "loss": 0.8485, + "step": 10215 + }, + { + "epoch": 12.64069264069264, + "grad_norm": 0.412109375, + "learning_rate": 1.4602333207418651e-05, + "loss": 0.8529, + "step": 10220 + }, + { + "epoch": 12.64687693259122, + "grad_norm": 0.439453125, + "learning_rate": 1.4527494836771438e-05, + "loss": 0.8639, + "step": 10225 + }, + { + "epoch": 12.653061224489797, + "grad_norm": 0.451171875, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.8531, + "step": 10230 + }, + { + "epoch": 12.659245516388374, + "grad_norm": 0.412109375, + "learning_rate": 1.4378349987581307e-05, + "loss": 0.8619, + "step": 10235 + }, + { + "epoch": 12.665429808286952, + "grad_norm": 0.42578125, + "learning_rate": 1.4304043818322565e-05, + "loss": 0.8556, + "step": 10240 + }, + { + "epoch": 12.67161410018553, + "grad_norm": 0.416015625, + "learning_rate": 1.4229915358197377e-05, + "loss": 0.8543, + "step": 10245 + }, + { + "epoch": 12.677798392084107, + "grad_norm": 0.427734375, + "learning_rate": 1.4155964760927176e-05, + "loss": 0.8553, + "step": 10250 + }, + { + "epoch": 12.683982683982684, + "grad_norm": 0.427734375, + "learning_rate": 1.4082192179864518e-05, + "loss": 0.8439, + "step": 10255 + }, + { + "epoch": 12.690166975881262, + "grad_norm": 0.421875, + "learning_rate": 1.4008597767992871e-05, + "loss": 0.8558, + "step": 10260 + }, + { + "epoch": 12.69635126777984, + "grad_norm": 0.447265625, + "learning_rate": 1.3935181677926156e-05, + "loss": 0.858, + "step": 10265 + }, + { + "epoch": 12.702535559678417, + "grad_norm": 0.4453125, + "learning_rate": 1.3861944061908583e-05, + "loss": 0.8561, + "step": 10270 + }, + { + "epoch": 12.708719851576994, + "grad_norm": 0.47265625, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.8531, + "step": 10275 + }, + { + "epoch": 12.714904143475572, + "grad_norm": 0.44140625, + "learning_rate": 1.3716004859146592e-05, + "loss": 0.8465, + "step": 10280 + }, + { + "epoch": 12.72108843537415, + "grad_norm": 0.423828125, + "learning_rate": 1.3643303575038779e-05, + "loss": 0.8433, + "step": 10285 + }, + { + "epoch": 12.727272727272727, + "grad_norm": 0.4375, + "learning_rate": 1.3570781370252582e-05, + "loss": 0.8566, + "step": 10290 + }, + { + "epoch": 12.733457019171304, + "grad_norm": 0.4296875, + "learning_rate": 1.3498438395178492e-05, + "loss": 0.8578, + "step": 10295 + }, + { + "epoch": 12.739641311069882, + "grad_norm": 0.423828125, + "learning_rate": 1.3426274799835337e-05, + "loss": 0.8606, + "step": 10300 + }, + { + "epoch": 12.745825602968461, + "grad_norm": 0.44140625, + "learning_rate": 1.3354290733869979e-05, + "loss": 0.8597, + "step": 10305 + }, + { + "epoch": 12.752009894867038, + "grad_norm": 0.41796875, + "learning_rate": 1.328248634655701e-05, + "loss": 0.8595, + "step": 10310 + }, + { + "epoch": 12.758194186765616, + "grad_norm": 0.439453125, + "learning_rate": 1.3210861786798335e-05, + "loss": 0.8615, + "step": 10315 + }, + { + "epoch": 12.764378478664193, + "grad_norm": 0.453125, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.8588, + "step": 10320 + }, + { + "epoch": 12.770562770562771, + "grad_norm": 0.43359375, + "learning_rate": 1.306815274368689e-05, + "loss": 0.8572, + "step": 10325 + }, + { + "epoch": 12.776747062461348, + "grad_norm": 0.423828125, + "learning_rate": 1.2997068556272263e-05, + "loss": 0.8521, + "step": 10330 + }, + { + "epoch": 12.782931354359926, + "grad_norm": 0.416015625, + "learning_rate": 1.2926164788287543e-05, + "loss": 0.852, + "step": 10335 + }, + { + "epoch": 12.789115646258503, + "grad_norm": 0.4453125, + "learning_rate": 1.2855441586767113e-05, + "loss": 0.8662, + "step": 10340 + }, + { + "epoch": 12.79529993815708, + "grad_norm": 0.44140625, + "learning_rate": 1.278489909837085e-05, + "loss": 0.8589, + "step": 10345 + }, + { + "epoch": 12.801484230055658, + "grad_norm": 0.423828125, + "learning_rate": 1.2714537469383858e-05, + "loss": 0.8578, + "step": 10350 + }, + { + "epoch": 12.807668521954236, + "grad_norm": 0.443359375, + "learning_rate": 1.2644356845716287e-05, + "loss": 0.8596, + "step": 10355 + }, + { + "epoch": 12.813852813852813, + "grad_norm": 0.46484375, + "learning_rate": 1.2574357372902767e-05, + "loss": 0.8585, + "step": 10360 + }, + { + "epoch": 12.82003710575139, + "grad_norm": 0.443359375, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.8529, + "step": 10365 + }, + { + "epoch": 12.826221397649968, + "grad_norm": 0.41796875, + "learning_rate": 1.243490246009842e-05, + "loss": 0.8488, + "step": 10370 + }, + { + "epoch": 12.832405689548546, + "grad_norm": 0.447265625, + "learning_rate": 1.2365447309297529e-05, + "loss": 0.8562, + "step": 10375 + }, + { + "epoch": 12.838589981447125, + "grad_norm": 0.4375, + "learning_rate": 1.2296173887730123e-05, + "loss": 0.8609, + "step": 10380 + }, + { + "epoch": 12.844774273345703, + "grad_norm": 0.412109375, + "learning_rate": 1.2227082339049612e-05, + "loss": 0.862, + "step": 10385 + }, + { + "epoch": 12.85095856524428, + "grad_norm": 0.4296875, + "learning_rate": 1.215817280653232e-05, + "loss": 0.8521, + "step": 10390 + }, + { + "epoch": 12.857142857142858, + "grad_norm": 0.4375, + "learning_rate": 1.2089445433077073e-05, + "loss": 0.8468, + "step": 10395 + }, + { + "epoch": 12.863327149041435, + "grad_norm": 0.423828125, + "learning_rate": 1.2020900361204968e-05, + "loss": 0.855, + "step": 10400 + }, + { + "epoch": 12.869511440940013, + "grad_norm": 0.44140625, + "learning_rate": 1.19525377330591e-05, + "loss": 0.8517, + "step": 10405 + }, + { + "epoch": 12.87569573283859, + "grad_norm": 0.40625, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.8631, + "step": 10410 + }, + { + "epoch": 12.881880024737168, + "grad_norm": 0.4296875, + "learning_rate": 1.1816360374626245e-05, + "loss": 0.8458, + "step": 10415 + }, + { + "epoch": 12.888064316635745, + "grad_norm": 0.41015625, + "learning_rate": 1.1748545926732535e-05, + "loss": 0.8588, + "step": 10420 + }, + { + "epoch": 12.894248608534323, + "grad_norm": 0.408203125, + "learning_rate": 1.1680914487350959e-05, + "loss": 0.8574, + "step": 10425 + }, + { + "epoch": 12.9004329004329, + "grad_norm": 0.4296875, + "learning_rate": 1.1613466196729984e-05, + "loss": 0.8485, + "step": 10430 + }, + { + "epoch": 12.906617192331478, + "grad_norm": 0.4296875, + "learning_rate": 1.1546201194738227e-05, + "loss": 0.8718, + "step": 10435 + }, + { + "epoch": 12.912801484230055, + "grad_norm": 0.4296875, + "learning_rate": 1.1479119620864276e-05, + "loss": 0.851, + "step": 10440 + }, + { + "epoch": 12.918985776128633, + "grad_norm": 0.419921875, + "learning_rate": 1.1412221614216278e-05, + "loss": 0.846, + "step": 10445 + }, + { + "epoch": 12.92517006802721, + "grad_norm": 0.416015625, + "learning_rate": 1.1345507313521786e-05, + "loss": 0.8575, + "step": 10450 + }, + { + "epoch": 12.931354359925788, + "grad_norm": 0.427734375, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.8535, + "step": 10455 + }, + { + "epoch": 12.937538651824367, + "grad_norm": 0.443359375, + "learning_rate": 1.1212630382998213e-05, + "loss": 0.8522, + "step": 10460 + }, + { + "epoch": 12.943722943722944, + "grad_norm": 0.439453125, + "learning_rate": 1.1146468028718237e-05, + "loss": 0.8622, + "step": 10465 + }, + { + "epoch": 12.949907235621522, + "grad_norm": 0.43359375, + "learning_rate": 1.1080489931489391e-05, + "loss": 0.8574, + "step": 10470 + }, + { + "epoch": 12.9560915275201, + "grad_norm": 0.419921875, + "learning_rate": 1.101469622813157e-05, + "loss": 0.867, + "step": 10475 + }, + { + "epoch": 12.962275819418677, + "grad_norm": 0.439453125, + "learning_rate": 1.0949087055082252e-05, + "loss": 0.8508, + "step": 10480 + }, + { + "epoch": 12.968460111317254, + "grad_norm": 0.44140625, + "learning_rate": 1.0883662548396257e-05, + "loss": 0.8491, + "step": 10485 + }, + { + "epoch": 12.974644403215832, + "grad_norm": 0.4375, + "learning_rate": 1.0818422843745512e-05, + "loss": 0.8596, + "step": 10490 + }, + { + "epoch": 12.98082869511441, + "grad_norm": 0.431640625, + "learning_rate": 1.0753368076418647e-05, + "loss": 0.8611, + "step": 10495 + }, + { + "epoch": 12.987012987012987, + "grad_norm": 0.40625, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.8466, + "step": 10500 + }, + { + "epoch": 12.993197278911564, + "grad_norm": 0.431640625, + "learning_rate": 1.0623813892973455e-05, + "loss": 0.8582, + "step": 10505 + }, + { + "epoch": 12.999381570810142, + "grad_norm": 0.44140625, + "learning_rate": 1.0559314745513805e-05, + "loss": 0.8621, + "step": 10510 + }, + { + "epoch": 12.999381570810142, + "eval_loss": 2.520648717880249, + "eval_runtime": 0.6408, + "eval_samples_per_second": 15.605, + "eval_steps_per_second": 1.56, + "step": 10510 + }, + { + "epoch": 13.00556586270872, + "grad_norm": 0.439453125, + "learning_rate": 1.049500107269481e-05, + "loss": 0.8552, + "step": 10515 + }, + { + "epoch": 13.011750154607297, + "grad_norm": 0.4296875, + "learning_rate": 1.0430873007884857e-05, + "loss": 0.8558, + "step": 10520 + }, + { + "epoch": 13.017934446505874, + "grad_norm": 0.419921875, + "learning_rate": 1.0366930684067333e-05, + "loss": 0.8587, + "step": 10525 + }, + { + "epoch": 13.024118738404452, + "grad_norm": 0.4375, + "learning_rate": 1.0303174233840528e-05, + "loss": 0.861, + "step": 10530 + }, + { + "epoch": 13.030303030303031, + "grad_norm": 0.427734375, + "learning_rate": 1.0239603789417274e-05, + "loss": 0.8619, + "step": 10535 + }, + { + "epoch": 13.036487322201609, + "grad_norm": 0.431640625, + "learning_rate": 1.0176219482624616e-05, + "loss": 0.8493, + "step": 10540 + }, + { + "epoch": 13.042671614100186, + "grad_norm": 0.4140625, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.8582, + "step": 10545 + }, + { + "epoch": 13.048855905998764, + "grad_norm": 0.42578125, + "learning_rate": 1.0050009807309325e-05, + "loss": 0.8486, + "step": 10550 + }, + { + "epoch": 13.055040197897341, + "grad_norm": 0.421875, + "learning_rate": 9.987184700509755e-06, + "loss": 0.8461, + "step": 10555 + }, + { + "epoch": 13.061224489795919, + "grad_norm": 0.427734375, + "learning_rate": 9.924546254786493e-06, + "loss": 0.8626, + "step": 10560 + }, + { + "epoch": 13.067408781694496, + "grad_norm": 0.412109375, + "learning_rate": 9.862094600033912e-06, + "loss": 0.861, + "step": 10565 + }, + { + "epoch": 13.073593073593074, + "grad_norm": 0.416015625, + "learning_rate": 9.799829865759069e-06, + "loss": 0.8576, + "step": 10570 + }, + { + "epoch": 13.079777365491651, + "grad_norm": 0.423828125, + "learning_rate": 9.737752181081338e-06, + "loss": 0.8471, + "step": 10575 + }, + { + "epoch": 13.085961657390229, + "grad_norm": 0.423828125, + "learning_rate": 9.675861674732312e-06, + "loss": 0.8518, + "step": 10580 + }, + { + "epoch": 13.092145949288806, + "grad_norm": 0.42578125, + "learning_rate": 9.614158475055302e-06, + "loss": 0.859, + "step": 10585 + }, + { + "epoch": 13.098330241187384, + "grad_norm": 0.439453125, + "learning_rate": 9.552642710005299e-06, + "loss": 0.8624, + "step": 10590 + }, + { + "epoch": 13.104514533085961, + "grad_norm": 0.427734375, + "learning_rate": 9.491314507148597e-06, + "loss": 0.8538, + "step": 10595 + }, + { + "epoch": 13.110698824984539, + "grad_norm": 0.421875, + "learning_rate": 9.430173993662451e-06, + "loss": 0.8567, + "step": 10600 + }, + { + "epoch": 13.116883116883116, + "grad_norm": 0.42578125, + "learning_rate": 9.369221296335006e-06, + "loss": 0.8544, + "step": 10605 + }, + { + "epoch": 13.123067408781694, + "grad_norm": 0.412109375, + "learning_rate": 9.308456541564881e-06, + "loss": 0.8461, + "step": 10610 + }, + { + "epoch": 13.129251700680273, + "grad_norm": 0.408203125, + "learning_rate": 9.24787985536094e-06, + "loss": 0.8612, + "step": 10615 + }, + { + "epoch": 13.13543599257885, + "grad_norm": 0.4140625, + "learning_rate": 9.187491363342093e-06, + "loss": 0.8727, + "step": 10620 + }, + { + "epoch": 13.141620284477428, + "grad_norm": 0.42578125, + "learning_rate": 9.127291190736943e-06, + "loss": 0.8599, + "step": 10625 + }, + { + "epoch": 13.147804576376005, + "grad_norm": 0.408203125, + "learning_rate": 9.067279462383615e-06, + "loss": 0.8478, + "step": 10630 + }, + { + "epoch": 13.153988868274583, + "grad_norm": 0.427734375, + "learning_rate": 9.0074563027294e-06, + "loss": 0.8527, + "step": 10635 + }, + { + "epoch": 13.16017316017316, + "grad_norm": 0.4140625, + "learning_rate": 8.947821835830616e-06, + "loss": 0.8508, + "step": 10640 + }, + { + "epoch": 13.166357452071738, + "grad_norm": 0.447265625, + "learning_rate": 8.88837618535222e-06, + "loss": 0.8556, + "step": 10645 + }, + { + "epoch": 13.172541743970315, + "grad_norm": 0.44140625, + "learning_rate": 8.829119474567671e-06, + "loss": 0.8484, + "step": 10650 + }, + { + "epoch": 13.178726035868893, + "grad_norm": 0.4140625, + "learning_rate": 8.770051826358594e-06, + "loss": 0.8635, + "step": 10655 + }, + { + "epoch": 13.18491032776747, + "grad_norm": 0.44140625, + "learning_rate": 8.711173363214553e-06, + "loss": 0.849, + "step": 10660 + }, + { + "epoch": 13.191094619666048, + "grad_norm": 0.421875, + "learning_rate": 8.652484207232803e-06, + "loss": 0.855, + "step": 10665 + }, + { + "epoch": 13.197278911564625, + "grad_norm": 0.4296875, + "learning_rate": 8.593984480118011e-06, + "loss": 0.8637, + "step": 10670 + }, + { + "epoch": 13.203463203463203, + "grad_norm": 0.419921875, + "learning_rate": 8.535674303182061e-06, + "loss": 0.855, + "step": 10675 + }, + { + "epoch": 13.20964749536178, + "grad_norm": 0.427734375, + "learning_rate": 8.47755379734373e-06, + "loss": 0.8649, + "step": 10680 + }, + { + "epoch": 13.215831787260358, + "grad_norm": 0.42578125, + "learning_rate": 8.419623083128458e-06, + "loss": 0.8694, + "step": 10685 + }, + { + "epoch": 13.222016079158937, + "grad_norm": 0.431640625, + "learning_rate": 8.361882280668165e-06, + "loss": 0.8572, + "step": 10690 + }, + { + "epoch": 13.228200371057515, + "grad_norm": 0.416015625, + "learning_rate": 8.304331509700891e-06, + "loss": 0.8529, + "step": 10695 + }, + { + "epoch": 13.234384662956092, + "grad_norm": 0.439453125, + "learning_rate": 8.24697088957066e-06, + "loss": 0.863, + "step": 10700 + }, + { + "epoch": 13.24056895485467, + "grad_norm": 0.41015625, + "learning_rate": 8.189800539227111e-06, + "loss": 0.86, + "step": 10705 + }, + { + "epoch": 13.246753246753247, + "grad_norm": 0.4140625, + "learning_rate": 8.132820577225387e-06, + "loss": 0.8673, + "step": 10710 + }, + { + "epoch": 13.252937538651825, + "grad_norm": 0.44140625, + "learning_rate": 8.076031121725746e-06, + "loss": 0.8537, + "step": 10715 + }, + { + "epoch": 13.259121830550402, + "grad_norm": 0.421875, + "learning_rate": 8.019432290493457e-06, + "loss": 0.8516, + "step": 10720 + }, + { + "epoch": 13.26530612244898, + "grad_norm": 0.431640625, + "learning_rate": 7.963024200898462e-06, + "loss": 0.859, + "step": 10725 + }, + { + "epoch": 13.271490414347557, + "grad_norm": 0.41796875, + "learning_rate": 7.906806969915148e-06, + "loss": 0.8595, + "step": 10730 + }, + { + "epoch": 13.277674706246135, + "grad_norm": 0.431640625, + "learning_rate": 7.85078071412213e-06, + "loss": 0.8483, + "step": 10735 + }, + { + "epoch": 13.283858998144712, + "grad_norm": 0.439453125, + "learning_rate": 7.794945549701993e-06, + "loss": 0.8498, + "step": 10740 + }, + { + "epoch": 13.29004329004329, + "grad_norm": 0.423828125, + "learning_rate": 7.739301592441017e-06, + "loss": 0.859, + "step": 10745 + }, + { + "epoch": 13.296227581941867, + "grad_norm": 0.412109375, + "learning_rate": 7.683848957729056e-06, + "loss": 0.8533, + "step": 10750 + }, + { + "epoch": 13.302411873840445, + "grad_norm": 0.421875, + "learning_rate": 7.6285877605591135e-06, + "loss": 0.8526, + "step": 10755 + }, + { + "epoch": 13.308596165739022, + "grad_norm": 0.421875, + "learning_rate": 7.573518115527289e-06, + "loss": 0.8536, + "step": 10760 + }, + { + "epoch": 13.3147804576376, + "grad_norm": 0.435546875, + "learning_rate": 7.5186401368324e-06, + "loss": 0.8591, + "step": 10765 + }, + { + "epoch": 13.320964749536179, + "grad_norm": 0.41796875, + "learning_rate": 7.463953938275858e-06, + "loss": 0.8542, + "step": 10770 + }, + { + "epoch": 13.327149041434756, + "grad_norm": 0.412109375, + "learning_rate": 7.409459633261307e-06, + "loss": 0.8495, + "step": 10775 + }, + { + "epoch": 13.333333333333334, + "grad_norm": 0.435546875, + "learning_rate": 7.355157334794516e-06, + "loss": 0.8547, + "step": 10780 + }, + { + "epoch": 13.339517625231911, + "grad_norm": 0.431640625, + "learning_rate": 7.3010471554830766e-06, + "loss": 0.8581, + "step": 10785 + }, + { + "epoch": 13.345701917130489, + "grad_norm": 0.451171875, + "learning_rate": 7.247129207536152e-06, + "loss": 0.8583, + "step": 10790 + }, + { + "epoch": 13.351886209029066, + "grad_norm": 0.4375, + "learning_rate": 7.193403602764315e-06, + "loss": 0.8665, + "step": 10795 + }, + { + "epoch": 13.358070500927644, + "grad_norm": 0.443359375, + "learning_rate": 7.1398704525792e-06, + "loss": 0.8528, + "step": 10800 + }, + { + "epoch": 13.364254792826221, + "grad_norm": 0.431640625, + "learning_rate": 7.086529867993453e-06, + "loss": 0.8538, + "step": 10805 + }, + { + "epoch": 13.370439084724799, + "grad_norm": 0.41015625, + "learning_rate": 7.0333819596203e-06, + "loss": 0.8579, + "step": 10810 + }, + { + "epoch": 13.376623376623376, + "grad_norm": 0.435546875, + "learning_rate": 6.980426837673437e-06, + "loss": 0.8534, + "step": 10815 + }, + { + "epoch": 13.382807668521954, + "grad_norm": 0.458984375, + "learning_rate": 6.927664611966811e-06, + "loss": 0.8529, + "step": 10820 + }, + { + "epoch": 13.388991960420531, + "grad_norm": 0.412109375, + "learning_rate": 6.875095391914299e-06, + "loss": 0.8577, + "step": 10825 + }, + { + "epoch": 13.395176252319109, + "grad_norm": 0.4296875, + "learning_rate": 6.8227192865295995e-06, + "loss": 0.8588, + "step": 10830 + }, + { + "epoch": 13.401360544217686, + "grad_norm": 0.431640625, + "learning_rate": 6.770536404425887e-06, + "loss": 0.8484, + "step": 10835 + }, + { + "epoch": 13.407544836116264, + "grad_norm": 0.431640625, + "learning_rate": 6.718546853815688e-06, + "loss": 0.8584, + "step": 10840 + }, + { + "epoch": 13.413729128014843, + "grad_norm": 0.423828125, + "learning_rate": 6.666750742510619e-06, + "loss": 0.8484, + "step": 10845 + }, + { + "epoch": 13.41991341991342, + "grad_norm": 0.423828125, + "learning_rate": 6.6151481779211155e-06, + "loss": 0.8478, + "step": 10850 + }, + { + "epoch": 13.426097711811998, + "grad_norm": 0.421875, + "learning_rate": 6.56373926705629e-06, + "loss": 0.8502, + "step": 10855 + }, + { + "epoch": 13.432282003710576, + "grad_norm": 0.419921875, + "learning_rate": 6.512524116523633e-06, + "loss": 0.8595, + "step": 10860 + }, + { + "epoch": 13.438466295609153, + "grad_norm": 0.439453125, + "learning_rate": 6.461502832528887e-06, + "loss": 0.8675, + "step": 10865 + }, + { + "epoch": 13.44465058750773, + "grad_norm": 0.435546875, + "learning_rate": 6.410675520875742e-06, + "loss": 0.8641, + "step": 10870 + }, + { + "epoch": 13.450834879406308, + "grad_norm": 0.42578125, + "learning_rate": 6.360042286965595e-06, + "loss": 0.8618, + "step": 10875 + }, + { + "epoch": 13.457019171304886, + "grad_norm": 0.4609375, + "learning_rate": 6.30960323579749e-06, + "loss": 0.8606, + "step": 10880 + }, + { + "epoch": 13.463203463203463, + "grad_norm": 0.43359375, + "learning_rate": 6.2593584719676805e-06, + "loss": 0.8486, + "step": 10885 + }, + { + "epoch": 13.46938775510204, + "grad_norm": 0.431640625, + "learning_rate": 6.209308099669597e-06, + "loss": 0.855, + "step": 10890 + }, + { + "epoch": 13.475572047000618, + "grad_norm": 0.427734375, + "learning_rate": 6.159452222693507e-06, + "loss": 0.8553, + "step": 10895 + }, + { + "epoch": 13.481756338899196, + "grad_norm": 0.431640625, + "learning_rate": 6.109790944426397e-06, + "loss": 0.8564, + "step": 10900 + }, + { + "epoch": 13.487940630797773, + "grad_norm": 0.421875, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.8541, + "step": 10905 + }, + { + "epoch": 13.49412492269635, + "grad_norm": 0.40234375, + "learning_rate": 6.011052595549038e-06, + "loss": 0.8528, + "step": 10910 + }, + { + "epoch": 13.500309214594928, + "grad_norm": 0.427734375, + "learning_rate": 5.961975729694158e-06, + "loss": 0.8538, + "step": 10915 + }, + { + "epoch": 13.506493506493506, + "grad_norm": 0.42578125, + "learning_rate": 5.913093872058528e-06, + "loss": 0.8546, + "step": 10920 + }, + { + "epoch": 13.512677798392085, + "grad_norm": 0.427734375, + "learning_rate": 5.864407124009297e-06, + "loss": 0.8573, + "step": 10925 + }, + { + "epoch": 13.518862090290662, + "grad_norm": 0.42578125, + "learning_rate": 5.81591558650898e-06, + "loss": 0.8529, + "step": 10930 + }, + { + "epoch": 13.52504638218924, + "grad_norm": 0.416015625, + "learning_rate": 5.767619360115295e-06, + "loss": 0.8528, + "step": 10935 + }, + { + "epoch": 13.531230674087817, + "grad_norm": 0.443359375, + "learning_rate": 5.719518544980929e-06, + "loss": 0.8482, + "step": 10940 + }, + { + "epoch": 13.537414965986395, + "grad_norm": 0.435546875, + "learning_rate": 5.6716132408533355e-06, + "loss": 0.852, + "step": 10945 + }, + { + "epoch": 13.543599257884972, + "grad_norm": 0.443359375, + "learning_rate": 5.623903547074549e-06, + "loss": 0.8517, + "step": 10950 + }, + { + "epoch": 13.54978354978355, + "grad_norm": 0.41796875, + "learning_rate": 5.5763895625809415e-06, + "loss": 0.8595, + "step": 10955 + }, + { + "epoch": 13.555967841682127, + "grad_norm": 0.451171875, + "learning_rate": 5.529071385903084e-06, + "loss": 0.8475, + "step": 10960 + }, + { + "epoch": 13.562152133580705, + "grad_norm": 0.416015625, + "learning_rate": 5.481949115165452e-06, + "loss": 0.8665, + "step": 10965 + }, + { + "epoch": 13.568336425479282, + "grad_norm": 0.443359375, + "learning_rate": 5.43502284808628e-06, + "loss": 0.8565, + "step": 10970 + }, + { + "epoch": 13.57452071737786, + "grad_norm": 0.435546875, + "learning_rate": 5.38829268197738e-06, + "loss": 0.8591, + "step": 10975 + }, + { + "epoch": 13.580705009276437, + "grad_norm": 0.447265625, + "learning_rate": 5.341758713743828e-06, + "loss": 0.8511, + "step": 10980 + }, + { + "epoch": 13.586889301175015, + "grad_norm": 0.443359375, + "learning_rate": 5.295421039883941e-06, + "loss": 0.8608, + "step": 10985 + }, + { + "epoch": 13.593073593073592, + "grad_norm": 0.4609375, + "learning_rate": 5.249279756488878e-06, + "loss": 0.8418, + "step": 10990 + }, + { + "epoch": 13.59925788497217, + "grad_norm": 0.431640625, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.8517, + "step": 10995 + }, + { + "epoch": 13.60544217687075, + "grad_norm": 0.443359375, + "learning_rate": 5.157586743421672e-06, + "loss": 0.8641, + "step": 11000 + }, + { + "epoch": 13.611626468769327, + "grad_norm": 0.451171875, + "learning_rate": 5.112035203894827e-06, + "loss": 0.846, + "step": 11005 + }, + { + "epoch": 13.617810760667904, + "grad_norm": 0.439453125, + "learning_rate": 5.066680435123106e-06, + "loss": 0.8578, + "step": 11010 + }, + { + "epoch": 13.623995052566482, + "grad_norm": 0.447265625, + "learning_rate": 5.021522531159428e-06, + "loss": 0.8662, + "step": 11015 + }, + { + "epoch": 13.63017934446506, + "grad_norm": 0.435546875, + "learning_rate": 4.976561585648509e-06, + "loss": 0.8479, + "step": 11020 + }, + { + "epoch": 13.636363636363637, + "grad_norm": 0.443359375, + "learning_rate": 4.931797691826601e-06, + "loss": 0.8492, + "step": 11025 + }, + { + "epoch": 13.642547928262214, + "grad_norm": 0.423828125, + "learning_rate": 4.887230942521337e-06, + "loss": 0.8523, + "step": 11030 + }, + { + "epoch": 13.648732220160792, + "grad_norm": 0.44921875, + "learning_rate": 4.842861430151557e-06, + "loss": 0.8563, + "step": 11035 + }, + { + "epoch": 13.65491651205937, + "grad_norm": 0.416015625, + "learning_rate": 4.798689246727006e-06, + "loss": 0.8535, + "step": 11040 + }, + { + "epoch": 13.661100803957947, + "grad_norm": 0.416015625, + "learning_rate": 4.754714483848333e-06, + "loss": 0.8502, + "step": 11045 + }, + { + "epoch": 13.667285095856524, + "grad_norm": 0.4375, + "learning_rate": 4.710937232706691e-06, + "loss": 0.8562, + "step": 11050 + }, + { + "epoch": 13.673469387755102, + "grad_norm": 0.427734375, + "learning_rate": 4.667357584083721e-06, + "loss": 0.8523, + "step": 11055 + }, + { + "epoch": 13.67965367965368, + "grad_norm": 0.427734375, + "learning_rate": 4.623975628351273e-06, + "loss": 0.8623, + "step": 11060 + }, + { + "epoch": 13.685837971552257, + "grad_norm": 0.4140625, + "learning_rate": 4.5807914554712005e-06, + "loss": 0.8535, + "step": 11065 + }, + { + "epoch": 13.692022263450834, + "grad_norm": 0.4296875, + "learning_rate": 4.537805154995278e-06, + "loss": 0.8433, + "step": 11070 + }, + { + "epoch": 13.698206555349412, + "grad_norm": 0.419921875, + "learning_rate": 4.49501681606489e-06, + "loss": 0.8533, + "step": 11075 + }, + { + "epoch": 13.70439084724799, + "grad_norm": 0.419921875, + "learning_rate": 4.452426527410947e-06, + "loss": 0.851, + "step": 11080 + }, + { + "epoch": 13.710575139146568, + "grad_norm": 0.423828125, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.8634, + "step": 11085 + }, + { + "epoch": 13.716759431045146, + "grad_norm": 0.431640625, + "learning_rate": 4.36784045380223e-06, + "loss": 0.8549, + "step": 11090 + }, + { + "epoch": 13.722943722943723, + "grad_norm": 0.435546875, + "learning_rate": 4.325844844255023e-06, + "loss": 0.8541, + "step": 11095 + }, + { + "epoch": 13.729128014842301, + "grad_norm": 0.42578125, + "learning_rate": 4.2840476357989825e-06, + "loss": 0.8611, + "step": 11100 + }, + { + "epoch": 13.735312306740878, + "grad_norm": 0.412109375, + "learning_rate": 4.242448915109698e-06, + "loss": 0.8532, + "step": 11105 + }, + { + "epoch": 13.741496598639456, + "grad_norm": 0.423828125, + "learning_rate": 4.20104876845111e-06, + "loss": 0.8538, + "step": 11110 + }, + { + "epoch": 13.747680890538033, + "grad_norm": 0.412109375, + "learning_rate": 4.159847281675411e-06, + "loss": 0.8525, + "step": 11115 + }, + { + "epoch": 13.753865182436611, + "grad_norm": 0.412109375, + "learning_rate": 4.118844540222788e-06, + "loss": 0.8455, + "step": 11120 + }, + { + "epoch": 13.760049474335188, + "grad_norm": 0.412109375, + "learning_rate": 4.078040629121327e-06, + "loss": 0.8592, + "step": 11125 + }, + { + "epoch": 13.766233766233766, + "grad_norm": 0.439453125, + "learning_rate": 4.037435632986786e-06, + "loss": 0.86, + "step": 11130 + }, + { + "epoch": 13.772418058132343, + "grad_norm": 0.439453125, + "learning_rate": 3.997029636022387e-06, + "loss": 0.8488, + "step": 11135 + }, + { + "epoch": 13.778602350030921, + "grad_norm": 0.4453125, + "learning_rate": 3.95682272201876e-06, + "loss": 0.8485, + "step": 11140 + }, + { + "epoch": 13.784786641929498, + "grad_norm": 0.40625, + "learning_rate": 3.916814974353633e-06, + "loss": 0.8525, + "step": 11145 + }, + { + "epoch": 13.790970933828076, + "grad_norm": 0.4296875, + "learning_rate": 3.877006475991729e-06, + "loss": 0.8634, + "step": 11150 + }, + { + "epoch": 13.797155225726655, + "grad_norm": 0.427734375, + "learning_rate": 3.837397309484636e-06, + "loss": 0.8615, + "step": 11155 + }, + { + "epoch": 13.803339517625233, + "grad_norm": 0.44140625, + "learning_rate": 3.797987556970495e-06, + "loss": 0.8533, + "step": 11160 + }, + { + "epoch": 13.80952380952381, + "grad_norm": 0.416015625, + "learning_rate": 3.75877730017401e-06, + "loss": 0.8519, + "step": 11165 + }, + { + "epoch": 13.815708101422388, + "grad_norm": 0.443359375, + "learning_rate": 3.7197666204060955e-06, + "loss": 0.8673, + "step": 11170 + }, + { + "epoch": 13.821892393320965, + "grad_norm": 0.4140625, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.8561, + "step": 11175 + }, + { + "epoch": 13.828076685219543, + "grad_norm": 0.447265625, + "learning_rate": 3.6423443151304526e-06, + "loss": 0.8519, + "step": 11180 + }, + { + "epoch": 13.83426097711812, + "grad_norm": 0.4375, + "learning_rate": 3.6039328501746293e-06, + "loss": 0.8641, + "step": 11185 + }, + { + "epoch": 13.840445269016698, + "grad_norm": 0.41796875, + "learning_rate": 3.565721283350931e-06, + "loss": 0.8552, + "step": 11190 + }, + { + "epoch": 13.846629560915275, + "grad_norm": 0.419921875, + "learning_rate": 3.527709693899306e-06, + "loss": 0.8747, + "step": 11195 + }, + { + "epoch": 13.852813852813853, + "grad_norm": 0.427734375, + "learning_rate": 3.4898981606450333e-06, + "loss": 0.8507, + "step": 11200 + }, + { + "epoch": 13.85899814471243, + "grad_norm": 0.44921875, + "learning_rate": 3.452286761998491e-06, + "loss": 0.8636, + "step": 11205 + }, + { + "epoch": 13.865182436611008, + "grad_norm": 0.431640625, + "learning_rate": 3.414875575955101e-06, + "loss": 0.8498, + "step": 11210 + }, + { + "epoch": 13.871366728509585, + "grad_norm": 0.416015625, + "learning_rate": 3.3776646800950605e-06, + "loss": 0.8534, + "step": 11215 + }, + { + "epoch": 13.877551020408163, + "grad_norm": 0.427734375, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.8435, + "step": 11220 + }, + { + "epoch": 13.88373531230674, + "grad_norm": 0.4296875, + "learning_rate": 3.303844067168904e-06, + "loss": 0.8524, + "step": 11225 + }, + { + "epoch": 13.889919604205318, + "grad_norm": 0.4375, + "learning_rate": 3.267234503185823e-06, + "loss": 0.8505, + "step": 11230 + }, + { + "epoch": 13.896103896103895, + "grad_norm": 0.4296875, + "learning_rate": 3.2308255355518403e-06, + "loss": 0.8557, + "step": 11235 + }, + { + "epoch": 13.902288188002474, + "grad_norm": 0.439453125, + "learning_rate": 3.1946172397688267e-06, + "loss": 0.8562, + "step": 11240 + }, + { + "epoch": 13.908472479901052, + "grad_norm": 0.427734375, + "learning_rate": 3.158609690922554e-06, + "loss": 0.8555, + "step": 11245 + }, + { + "epoch": 13.91465677179963, + "grad_norm": 0.416015625, + "learning_rate": 3.1228029636824475e-06, + "loss": 0.8618, + "step": 11250 + }, + { + "epoch": 13.920841063698207, + "grad_norm": 0.416015625, + "learning_rate": 3.0871971323015336e-06, + "loss": 0.8604, + "step": 11255 + }, + { + "epoch": 13.927025355596784, + "grad_norm": 0.41015625, + "learning_rate": 3.051792270616216e-06, + "loss": 0.8598, + "step": 11260 + }, + { + "epoch": 13.933209647495362, + "grad_norm": 0.419921875, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.8531, + "step": 11265 + }, + { + "epoch": 13.93939393939394, + "grad_norm": 0.43359375, + "learning_rate": 2.981585749594051e-06, + "loss": 0.8508, + "step": 11270 + }, + { + "epoch": 13.945578231292517, + "grad_norm": 0.439453125, + "learning_rate": 2.9467842358456345e-06, + "loss": 0.8466, + "step": 11275 + }, + { + "epoch": 13.951762523191094, + "grad_norm": 0.44140625, + "learning_rate": 2.912183982969385e-06, + "loss": 0.8548, + "step": 11280 + }, + { + "epoch": 13.957946815089672, + "grad_norm": 0.43359375, + "learning_rate": 2.8777850627164205e-06, + "loss": 0.8616, + "step": 11285 + }, + { + "epoch": 13.96413110698825, + "grad_norm": 0.41796875, + "learning_rate": 2.8435875464203343e-06, + "loss": 0.8578, + "step": 11290 + }, + { + "epoch": 13.970315398886827, + "grad_norm": 0.423828125, + "learning_rate": 2.809591504997111e-06, + "loss": 0.8674, + "step": 11295 + }, + { + "epoch": 13.976499690785404, + "grad_norm": 0.42578125, + "learning_rate": 2.7757970089449024e-06, + "loss": 0.8573, + "step": 11300 + }, + { + "epoch": 13.982683982683982, + "grad_norm": 0.44921875, + "learning_rate": 2.742204128343917e-06, + "loss": 0.8582, + "step": 11305 + }, + { + "epoch": 13.988868274582561, + "grad_norm": 0.423828125, + "learning_rate": 2.708812932856253e-06, + "loss": 0.8554, + "step": 11310 + }, + { + "epoch": 13.995052566481139, + "grad_norm": 0.416015625, + "learning_rate": 2.6756234917258205e-06, + "loss": 0.8586, + "step": 11315 + }, + { + "epoch": 14.0, + "eval_loss": 2.5210444927215576, + "eval_runtime": 0.5394, + "eval_samples_per_second": 18.54, + "eval_steps_per_second": 1.854, + "step": 11319 + }, + { + "epoch": 14.001236858379716, + "grad_norm": 0.43359375, + "learning_rate": 2.6426358737781098e-06, + "loss": 0.8497, + "step": 11320 + }, + { + "epoch": 14.007421150278294, + "grad_norm": 0.44921875, + "learning_rate": 2.6098501474200787e-06, + "loss": 0.8456, + "step": 11325 + }, + { + "epoch": 14.013605442176871, + "grad_norm": 0.4375, + "learning_rate": 2.577266380640053e-06, + "loss": 0.8513, + "step": 11330 + }, + { + "epoch": 14.019789734075449, + "grad_norm": 0.439453125, + "learning_rate": 2.5448846410075166e-06, + "loss": 0.8512, + "step": 11335 + }, + { + "epoch": 14.025974025974026, + "grad_norm": 0.423828125, + "learning_rate": 2.5127049956730207e-06, + "loss": 0.8564, + "step": 11340 + }, + { + "epoch": 14.032158317872604, + "grad_norm": 0.421875, + "learning_rate": 2.480727511368064e-06, + "loss": 0.8522, + "step": 11345 + }, + { + "epoch": 14.038342609771181, + "grad_norm": 0.431640625, + "learning_rate": 2.448952254404846e-06, + "loss": 0.8645, + "step": 11350 + }, + { + "epoch": 14.044526901669759, + "grad_norm": 0.4296875, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.8509, + "step": 11355 + }, + { + "epoch": 14.050711193568336, + "grad_norm": 0.42578125, + "learning_rate": 2.3860086856557383e-06, + "loss": 0.857, + "step": 11360 + }, + { + "epoch": 14.056895485466914, + "grad_norm": 0.419921875, + "learning_rate": 2.35484050439696e-06, + "loss": 0.8457, + "step": 11365 + }, + { + "epoch": 14.063079777365491, + "grad_norm": 0.419921875, + "learning_rate": 2.3238748115339324e-06, + "loss": 0.8502, + "step": 11370 + }, + { + "epoch": 14.069264069264069, + "grad_norm": 0.421875, + "learning_rate": 2.293111671280712e-06, + "loss": 0.8608, + "step": 11375 + }, + { + "epoch": 14.075448361162646, + "grad_norm": 0.4375, + "learning_rate": 2.2625511474313685e-06, + "loss": 0.851, + "step": 11380 + }, + { + "epoch": 14.081632653061224, + "grad_norm": 0.439453125, + "learning_rate": 2.232193303359742e-06, + "loss": 0.8599, + "step": 11385 + }, + { + "epoch": 14.087816944959803, + "grad_norm": 0.431640625, + "learning_rate": 2.2020382020194074e-06, + "loss": 0.8584, + "step": 11390 + }, + { + "epoch": 14.09400123685838, + "grad_norm": 0.42578125, + "learning_rate": 2.1720859059434993e-06, + "loss": 0.8594, + "step": 11395 + }, + { + "epoch": 14.100185528756958, + "grad_norm": 0.447265625, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.8604, + "step": 11400 + }, + { + "epoch": 14.106369820655535, + "grad_norm": 0.412109375, + "learning_rate": 2.112789977614582e-06, + "loss": 0.8407, + "step": 11405 + }, + { + "epoch": 14.112554112554113, + "grad_norm": 0.419921875, + "learning_rate": 2.0834464683245346e-06, + "loss": 0.8531, + "step": 11410 + }, + { + "epoch": 14.11873840445269, + "grad_norm": 0.419921875, + "learning_rate": 2.0543060102245717e-06, + "loss": 0.8524, + "step": 11415 + }, + { + "epoch": 14.124922696351268, + "grad_norm": 0.4296875, + "learning_rate": 2.025368663743743e-06, + "loss": 0.8522, + "step": 11420 + }, + { + "epoch": 14.131106988249845, + "grad_norm": 0.4140625, + "learning_rate": 1.9966344888899147e-06, + "loss": 0.8648, + "step": 11425 + }, + { + "epoch": 14.137291280148423, + "grad_norm": 0.447265625, + "learning_rate": 1.968103545249611e-06, + "loss": 0.8546, + "step": 11430 + }, + { + "epoch": 14.143475572047, + "grad_norm": 0.4296875, + "learning_rate": 1.9397758919879495e-06, + "loss": 0.8616, + "step": 11435 + }, + { + "epoch": 14.149659863945578, + "grad_norm": 0.408203125, + "learning_rate": 1.91165158784844e-06, + "loss": 0.8539, + "step": 11440 + }, + { + "epoch": 14.155844155844155, + "grad_norm": 0.421875, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.8599, + "step": 11445 + }, + { + "epoch": 14.162028447742733, + "grad_norm": 0.423828125, + "learning_rate": 1.8560132598014368e-06, + "loss": 0.8608, + "step": 11450 + }, + { + "epoch": 14.16821273964131, + "grad_norm": 0.419921875, + "learning_rate": 1.8284993512720505e-06, + "loss": 0.8558, + "step": 11455 + }, + { + "epoch": 14.174397031539888, + "grad_norm": 0.4375, + "learning_rate": 1.8011890226208527e-06, + "loss": 0.8564, + "step": 11460 + }, + { + "epoch": 14.180581323438465, + "grad_norm": 0.42578125, + "learning_rate": 1.7740823304817188e-06, + "loss": 0.8581, + "step": 11465 + }, + { + "epoch": 14.186765615337045, + "grad_norm": 0.41796875, + "learning_rate": 1.7471793310662287e-06, + "loss": 0.866, + "step": 11470 + }, + { + "epoch": 14.192949907235622, + "grad_norm": 0.4296875, + "learning_rate": 1.7204800801636e-06, + "loss": 0.8628, + "step": 11475 + }, + { + "epoch": 14.1991341991342, + "grad_norm": 0.419921875, + "learning_rate": 1.6939846331405108e-06, + "loss": 0.8594, + "step": 11480 + }, + { + "epoch": 14.205318491032777, + "grad_norm": 0.41796875, + "learning_rate": 1.6676930449410099e-06, + "loss": 0.855, + "step": 11485 + }, + { + "epoch": 14.211502782931355, + "grad_norm": 0.41796875, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.8688, + "step": 11490 + }, + { + "epoch": 14.217687074829932, + "grad_norm": 0.42578125, + "learning_rate": 1.6157216626751292e-06, + "loss": 0.8607, + "step": 11495 + }, + { + "epoch": 14.22387136672851, + "grad_norm": 0.41796875, + "learning_rate": 1.5900419763826614e-06, + "loss": 0.8577, + "step": 11500 + }, + { + "epoch": 14.230055658627087, + "grad_norm": 0.427734375, + "learning_rate": 1.5645663644614172e-06, + "loss": 0.8483, + "step": 11505 + }, + { + "epoch": 14.236239950525665, + "grad_norm": 0.435546875, + "learning_rate": 1.5392948797405827e-06, + "loss": 0.8501, + "step": 11510 + }, + { + "epoch": 14.242424242424242, + "grad_norm": 0.427734375, + "learning_rate": 1.5142275746260593e-06, + "loss": 0.8419, + "step": 11515 + }, + { + "epoch": 14.24860853432282, + "grad_norm": 0.431640625, + "learning_rate": 1.489364501100332e-06, + "loss": 0.8652, + "step": 11520 + }, + { + "epoch": 14.254792826221397, + "grad_norm": 0.41796875, + "learning_rate": 1.4647057107223583e-06, + "loss": 0.8626, + "step": 11525 + }, + { + "epoch": 14.260977118119975, + "grad_norm": 0.43359375, + "learning_rate": 1.4402512546275114e-06, + "loss": 0.8516, + "step": 11530 + }, + { + "epoch": 14.267161410018552, + "grad_norm": 0.423828125, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.8496, + "step": 11535 + }, + { + "epoch": 14.27334570191713, + "grad_norm": 0.439453125, + "learning_rate": 1.3919555477097668e-06, + "loss": 0.8558, + "step": 11540 + }, + { + "epoch": 14.279529993815707, + "grad_norm": 0.421875, + "learning_rate": 1.3681143970385003e-06, + "loss": 0.8568, + "step": 11545 + }, + { + "epoch": 14.285714285714286, + "grad_norm": 0.455078125, + "learning_rate": 1.344477780953346e-06, + "loss": 0.8614, + "step": 11550 + }, + { + "epoch": 14.291898577612864, + "grad_norm": 0.4375, + "learning_rate": 1.3210457484699733e-06, + "loss": 0.8518, + "step": 11555 + }, + { + "epoch": 14.298082869511441, + "grad_norm": 0.4375, + "learning_rate": 1.2978183481797801e-06, + "loss": 0.8459, + "step": 11560 + }, + { + "epoch": 14.304267161410019, + "grad_norm": 0.451171875, + "learning_rate": 1.274795628249792e-06, + "loss": 0.8547, + "step": 11565 + }, + { + "epoch": 14.310451453308596, + "grad_norm": 0.4375, + "learning_rate": 1.251977636422641e-06, + "loss": 0.8592, + "step": 11570 + }, + { + "epoch": 14.316635745207174, + "grad_norm": 0.41796875, + "learning_rate": 1.2293644200163544e-06, + "loss": 0.8447, + "step": 11575 + }, + { + "epoch": 14.322820037105751, + "grad_norm": 0.416015625, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.8548, + "step": 11580 + }, + { + "epoch": 14.329004329004329, + "grad_norm": 0.44140625, + "learning_rate": 1.1847525006152493e-06, + "loss": 0.8512, + "step": 11585 + }, + { + "epoch": 14.335188620902906, + "grad_norm": 0.421875, + "learning_rate": 1.1627538901329172e-06, + "loss": 0.856, + "step": 11590 + }, + { + "epoch": 14.341372912801484, + "grad_norm": 0.43359375, + "learning_rate": 1.1409602400962227e-06, + "loss": 0.8518, + "step": 11595 + }, + { + "epoch": 14.347557204700061, + "grad_norm": 0.41796875, + "learning_rate": 1.1193715956990258e-06, + "loss": 0.8417, + "step": 11600 + }, + { + "epoch": 14.353741496598639, + "grad_norm": 0.42578125, + "learning_rate": 1.0979880017100596e-06, + "loss": 0.861, + "step": 11605 + }, + { + "epoch": 14.359925788497216, + "grad_norm": 0.435546875, + "learning_rate": 1.076809502472831e-06, + "loss": 0.8562, + "step": 11610 + }, + { + "epoch": 14.366110080395794, + "grad_norm": 0.5078125, + "learning_rate": 1.055836141905553e-06, + "loss": 0.8512, + "step": 11615 + }, + { + "epoch": 14.372294372294371, + "grad_norm": 0.421875, + "learning_rate": 1.035067963501024e-06, + "loss": 0.873, + "step": 11620 + }, + { + "epoch": 14.37847866419295, + "grad_norm": 0.423828125, + "learning_rate": 1.014505010326583e-06, + "loss": 0.8564, + "step": 11625 + }, + { + "epoch": 14.384662956091528, + "grad_norm": 0.43359375, + "learning_rate": 9.94147325023953e-07, + "loss": 0.8537, + "step": 11630 + }, + { + "epoch": 14.390847247990106, + "grad_norm": 0.421875, + "learning_rate": 9.739949498091982e-07, + "loss": 0.8618, + "step": 11635 + }, + { + "epoch": 14.397031539888683, + "grad_norm": 0.423828125, + "learning_rate": 9.540479264726676e-07, + "loss": 0.8567, + "step": 11640 + }, + { + "epoch": 14.40321583178726, + "grad_norm": 0.439453125, + "learning_rate": 9.343062963787952e-07, + "loss": 0.8583, + "step": 11645 + }, + { + "epoch": 14.409400123685838, + "grad_norm": 0.427734375, + "learning_rate": 9.147701004661446e-07, + "loss": 0.8569, + "step": 11650 + }, + { + "epoch": 14.415584415584416, + "grad_norm": 0.427734375, + "learning_rate": 8.954393792472649e-07, + "loss": 0.8431, + "step": 11655 + }, + { + "epoch": 14.421768707482993, + "grad_norm": 0.419921875, + "learning_rate": 8.763141728085789e-07, + "loss": 0.856, + "step": 11660 + }, + { + "epoch": 14.42795299938157, + "grad_norm": 0.41796875, + "learning_rate": 8.573945208103618e-07, + "loss": 0.8554, + "step": 11665 + }, + { + "epoch": 14.434137291280148, + "grad_norm": 0.43359375, + "learning_rate": 8.386804624865851e-07, + "loss": 0.8636, + "step": 11670 + }, + { + "epoch": 14.440321583178726, + "grad_norm": 0.42578125, + "learning_rate": 8.201720366449283e-07, + "loss": 0.8605, + "step": 11675 + }, + { + "epoch": 14.446505875077303, + "grad_norm": 0.43359375, + "learning_rate": 8.018692816666118e-07, + "loss": 0.8529, + "step": 11680 + }, + { + "epoch": 14.45269016697588, + "grad_norm": 0.427734375, + "learning_rate": 7.837722355063637e-07, + "loss": 0.8594, + "step": 11685 + }, + { + "epoch": 14.458874458874458, + "grad_norm": 0.435546875, + "learning_rate": 7.658809356923424e-07, + "loss": 0.8562, + "step": 11690 + }, + { + "epoch": 14.465058750773036, + "grad_norm": 0.41796875, + "learning_rate": 7.481954193260143e-07, + "loss": 0.8563, + "step": 11695 + }, + { + "epoch": 14.471243042671613, + "grad_norm": 0.4140625, + "learning_rate": 7.307157230821426e-07, + "loss": 0.8515, + "step": 11700 + }, + { + "epoch": 14.477427334570192, + "grad_norm": 0.455078125, + "learning_rate": 7.134418832086653e-07, + "loss": 0.858, + "step": 11705 + }, + { + "epoch": 14.48361162646877, + "grad_norm": 0.431640625, + "learning_rate": 6.963739355266286e-07, + "loss": 0.8608, + "step": 11710 + }, + { + "epoch": 14.489795918367347, + "grad_norm": 0.423828125, + "learning_rate": 6.7951191543012e-07, + "loss": 0.8662, + "step": 11715 + }, + { + "epoch": 14.495980210265925, + "grad_norm": 0.455078125, + "learning_rate": 6.628558578862021e-07, + "loss": 0.86, + "step": 11720 + }, + { + "epoch": 14.502164502164502, + "grad_norm": 0.431640625, + "learning_rate": 6.464057974348014e-07, + "loss": 0.8587, + "step": 11725 + }, + { + "epoch": 14.50834879406308, + "grad_norm": 0.44921875, + "learning_rate": 6.301617681886863e-07, + "loss": 0.859, + "step": 11730 + }, + { + "epoch": 14.514533085961657, + "grad_norm": 0.408203125, + "learning_rate": 6.141238038333885e-07, + "loss": 0.8607, + "step": 11735 + }, + { + "epoch": 14.520717377860235, + "grad_norm": 0.4140625, + "learning_rate": 5.982919376270823e-07, + "loss": 0.865, + "step": 11740 + }, + { + "epoch": 14.526901669758812, + "grad_norm": 0.4296875, + "learning_rate": 5.826662024005835e-07, + "loss": 0.864, + "step": 11745 + }, + { + "epoch": 14.53308596165739, + "grad_norm": 0.41796875, + "learning_rate": 5.672466305572388e-07, + "loss": 0.8474, + "step": 11750 + }, + { + "epoch": 14.539270253555967, + "grad_norm": 0.421875, + "learning_rate": 5.52033254072859e-07, + "loss": 0.8571, + "step": 11755 + }, + { + "epoch": 14.545454545454545, + "grad_norm": 0.43359375, + "learning_rate": 5.370261044956971e-07, + "loss": 0.852, + "step": 11760 + }, + { + "epoch": 14.551638837353122, + "grad_norm": 0.419921875, + "learning_rate": 5.222252129463146e-07, + "loss": 0.8602, + "step": 11765 + }, + { + "epoch": 14.5578231292517, + "grad_norm": 0.43359375, + "learning_rate": 5.07630610117582e-07, + "loss": 0.8637, + "step": 11770 + }, + { + "epoch": 14.564007421150277, + "grad_norm": 0.419921875, + "learning_rate": 4.932423262745456e-07, + "loss": 0.8446, + "step": 11775 + }, + { + "epoch": 14.570191713048857, + "grad_norm": 0.4140625, + "learning_rate": 4.790603912544489e-07, + "loss": 0.8551, + "step": 11780 + }, + { + "epoch": 14.576376004947434, + "grad_norm": 0.4375, + "learning_rate": 4.6508483446661144e-07, + "loss": 0.8601, + "step": 11785 + }, + { + "epoch": 14.582560296846012, + "grad_norm": 0.4296875, + "learning_rate": 4.5131568489236166e-07, + "loss": 0.862, + "step": 11790 + }, + { + "epoch": 14.58874458874459, + "grad_norm": 0.421875, + "learning_rate": 4.377529710850259e-07, + "loss": 0.8592, + "step": 11795 + }, + { + "epoch": 14.594928880643167, + "grad_norm": 0.427734375, + "learning_rate": 4.2439672116982855e-07, + "loss": 0.8491, + "step": 11800 + }, + { + "epoch": 14.601113172541744, + "grad_norm": 0.4296875, + "learning_rate": 4.112469628438365e-07, + "loss": 0.8512, + "step": 11805 + }, + { + "epoch": 14.607297464440322, + "grad_norm": 0.427734375, + "learning_rate": 3.983037233759368e-07, + "loss": 0.8562, + "step": 11810 + }, + { + "epoch": 14.6134817563389, + "grad_norm": 0.421875, + "learning_rate": 3.8556702960673706e-07, + "loss": 0.8498, + "step": 11815 + }, + { + "epoch": 14.619666048237477, + "grad_norm": 0.400390625, + "learning_rate": 3.73036907948543e-07, + "loss": 0.8548, + "step": 11820 + }, + { + "epoch": 14.625850340136054, + "grad_norm": 0.421875, + "learning_rate": 3.6071338438524726e-07, + "loss": 0.8642, + "step": 11825 + }, + { + "epoch": 14.632034632034632, + "grad_norm": 0.44140625, + "learning_rate": 3.485964844723744e-07, + "loss": 0.8464, + "step": 11830 + }, + { + "epoch": 14.63821892393321, + "grad_norm": 0.419921875, + "learning_rate": 3.366862333369358e-07, + "loss": 0.8568, + "step": 11835 + }, + { + "epoch": 14.644403215831787, + "grad_norm": 0.421875, + "learning_rate": 3.2498265567739717e-07, + "loss": 0.8662, + "step": 11840 + }, + { + "epoch": 14.650587507730364, + "grad_norm": 0.431640625, + "learning_rate": 3.134857757636889e-07, + "loss": 0.8558, + "step": 11845 + }, + { + "epoch": 14.656771799628942, + "grad_norm": 0.404296875, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.8455, + "step": 11850 + }, + { + "epoch": 14.66295609152752, + "grad_norm": 0.453125, + "learning_rate": 2.9111220411014437e-07, + "loss": 0.8558, + "step": 11855 + }, + { + "epoch": 14.669140383426098, + "grad_norm": 0.4609375, + "learning_rate": 2.8023555876673937e-07, + "loss": 0.85, + "step": 11860 + }, + { + "epoch": 14.675324675324676, + "grad_norm": 0.41796875, + "learning_rate": 2.6956570396197143e-07, + "loss": 0.848, + "step": 11865 + }, + { + "epoch": 14.681508967223253, + "grad_norm": 0.427734375, + "learning_rate": 2.5910266182207486e-07, + "loss": 0.8651, + "step": 11870 + }, + { + "epoch": 14.687693259121831, + "grad_norm": 0.4140625, + "learning_rate": 2.4884645404443795e-07, + "loss": 0.8467, + "step": 11875 + }, + { + "epoch": 14.693877551020408, + "grad_norm": 0.416015625, + "learning_rate": 2.3879710189753656e-07, + "loss": 0.8597, + "step": 11880 + }, + { + "epoch": 14.700061842918986, + "grad_norm": 0.41015625, + "learning_rate": 2.289546262208786e-07, + "loss": 0.8578, + "step": 11885 + }, + { + "epoch": 14.706246134817563, + "grad_norm": 0.4453125, + "learning_rate": 2.1931904742495957e-07, + "loss": 0.8528, + "step": 11890 + }, + { + "epoch": 14.712430426716141, + "grad_norm": 0.435546875, + "learning_rate": 2.098903854912515e-07, + "loss": 0.8549, + "step": 11895 + }, + { + "epoch": 14.718614718614718, + "grad_norm": 0.423828125, + "learning_rate": 2.0066865997212525e-07, + "loss": 0.8537, + "step": 11900 + }, + { + "epoch": 14.724799010513296, + "grad_norm": 0.408203125, + "learning_rate": 1.9165388999082822e-07, + "loss": 0.8617, + "step": 11905 + }, + { + "epoch": 14.730983302411873, + "grad_norm": 0.447265625, + "learning_rate": 1.8284609424142895e-07, + "loss": 0.858, + "step": 11910 + }, + { + "epoch": 14.737167594310451, + "grad_norm": 0.4296875, + "learning_rate": 1.7424529098881703e-07, + "loss": 0.8557, + "step": 11915 + }, + { + "epoch": 14.743351886209028, + "grad_norm": 0.4453125, + "learning_rate": 1.6585149806860324e-07, + "loss": 0.8588, + "step": 11920 + }, + { + "epoch": 14.749536178107606, + "grad_norm": 0.419921875, + "learning_rate": 1.5766473288715278e-07, + "loss": 0.8595, + "step": 11925 + }, + { + "epoch": 14.755720470006183, + "grad_norm": 0.41015625, + "learning_rate": 1.4968501242148547e-07, + "loss": 0.8498, + "step": 11930 + }, + { + "epoch": 14.761904761904763, + "grad_norm": 0.435546875, + "learning_rate": 1.4191235321928676e-07, + "loss": 0.8676, + "step": 11935 + }, + { + "epoch": 14.76808905380334, + "grad_norm": 0.43359375, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.8537, + "step": 11940 + }, + { + "epoch": 14.774273345701918, + "grad_norm": 0.435546875, + "learning_rate": 1.2698828264904317e-07, + "loss": 0.8622, + "step": 11945 + }, + { + "epoch": 14.780457637600495, + "grad_norm": 0.4609375, + "learning_rate": 1.1983690222929778e-07, + "loss": 0.8581, + "step": 11950 + }, + { + "epoch": 14.786641929499073, + "grad_norm": 0.4296875, + "learning_rate": 1.1289264496953111e-07, + "loss": 0.8554, + "step": 11955 + }, + { + "epoch": 14.79282622139765, + "grad_norm": 0.4453125, + "learning_rate": 1.0615552527017958e-07, + "loss": 0.8502, + "step": 11960 + }, + { + "epoch": 14.799010513296228, + "grad_norm": 0.419921875, + "learning_rate": 9.962555710212318e-08, + "loss": 0.8599, + "step": 11965 + }, + { + "epoch": 14.805194805194805, + "grad_norm": 0.4453125, + "learning_rate": 9.330275400666332e-08, + "loss": 0.8524, + "step": 11970 + }, + { + "epoch": 14.811379097093383, + "grad_norm": 0.42578125, + "learning_rate": 8.718712909548953e-08, + "loss": 0.8665, + "step": 11975 + }, + { + "epoch": 14.81756338899196, + "grad_norm": 0.42578125, + "learning_rate": 8.127869505069053e-08, + "loss": 0.8573, + "step": 11980 + }, + { + "epoch": 14.823747680890538, + "grad_norm": 0.404296875, + "learning_rate": 7.557746412468758e-08, + "loss": 0.8544, + "step": 11985 + }, + { + "epoch": 14.829931972789115, + "grad_norm": 0.404296875, + "learning_rate": 7.00834481402013e-08, + "loss": 0.8494, + "step": 11990 + }, + { + "epoch": 14.836116264687693, + "grad_norm": 0.4296875, + "learning_rate": 6.479665849027372e-08, + "loss": 0.8539, + "step": 11995 + }, + { + "epoch": 14.84230055658627, + "grad_norm": 0.427734375, + "learning_rate": 5.971710613821291e-08, + "loss": 0.8534, + "step": 12000 + }, + { + "epoch": 14.848484848484848, + "grad_norm": 0.423828125, + "learning_rate": 5.484480161755956e-08, + "loss": 0.8428, + "step": 12005 + }, + { + "epoch": 14.854669140383425, + "grad_norm": 0.455078125, + "learning_rate": 5.0179755032109253e-08, + "loss": 0.8615, + "step": 12010 + }, + { + "epoch": 14.860853432282005, + "grad_norm": 0.43359375, + "learning_rate": 4.572197605583473e-08, + "loss": 0.8527, + "step": 12015 + }, + { + "epoch": 14.867037724180582, + "grad_norm": 0.4375, + "learning_rate": 4.147147393290807e-08, + "loss": 0.861, + "step": 12020 + }, + { + "epoch": 14.87322201607916, + "grad_norm": 0.42578125, + "learning_rate": 3.742825747766743e-08, + "loss": 0.8517, + "step": 12025 + }, + { + "epoch": 14.879406307977737, + "grad_norm": 0.408203125, + "learning_rate": 3.359233507459481e-08, + "loss": 0.8382, + "step": 12030 + }, + { + "epoch": 14.885590599876314, + "grad_norm": 0.447265625, + "learning_rate": 2.9963714678316045e-08, + "loss": 0.8497, + "step": 12035 + }, + { + "epoch": 14.891774891774892, + "grad_norm": 0.412109375, + "learning_rate": 2.6542403813545334e-08, + "loss": 0.8573, + "step": 12040 + }, + { + "epoch": 14.89795918367347, + "grad_norm": 0.416015625, + "learning_rate": 2.3328409575129608e-08, + "loss": 0.8541, + "step": 12045 + }, + { + "epoch": 14.904143475572047, + "grad_norm": 0.4375, + "learning_rate": 2.0321738627981923e-08, + "loss": 0.856, + "step": 12050 + }, + { + "epoch": 14.910327767470624, + "grad_norm": 0.43359375, + "learning_rate": 1.7522397207070383e-08, + "loss": 0.8598, + "step": 12055 + }, + { + "epoch": 14.916512059369202, + "grad_norm": 0.45703125, + "learning_rate": 1.4930391117451426e-08, + "loss": 0.8526, + "step": 12060 + }, + { + "epoch": 14.92269635126778, + "grad_norm": 0.419921875, + "learning_rate": 1.2545725734192103e-08, + "loss": 0.8637, + "step": 12065 + }, + { + "epoch": 14.928880643166357, + "grad_norm": 0.439453125, + "learning_rate": 1.0368406002436715e-08, + "loss": 0.8585, + "step": 12070 + }, + { + "epoch": 14.935064935064934, + "grad_norm": 0.412109375, + "learning_rate": 8.398436437317969e-09, + "loss": 0.8521, + "step": 12075 + }, + { + "epoch": 14.941249226963512, + "grad_norm": 0.416015625, + "learning_rate": 6.635821124001406e-09, + "loss": 0.8555, + "step": 12080 + }, + { + "epoch": 14.94743351886209, + "grad_norm": 0.423828125, + "learning_rate": 5.080563717629882e-09, + "loss": 0.8507, + "step": 12085 + }, + { + "epoch": 14.953617810760669, + "grad_norm": 0.431640625, + "learning_rate": 3.732667443390181e-09, + "loss": 0.8526, + "step": 12090 + }, + { + "epoch": 14.959802102659246, + "grad_norm": 0.431640625, + "learning_rate": 2.5921350964352997e-09, + "loss": 0.8538, + "step": 12095 + }, + { + "epoch": 14.965986394557824, + "grad_norm": 0.447265625, + "learning_rate": 1.6589690418955528e-09, + "loss": 0.8587, + "step": 12100 + }, + { + "epoch": 14.972170686456401, + "grad_norm": 0.427734375, + "learning_rate": 9.33171214889672e-10, + "loss": 0.8486, + "step": 12105 + }, + { + "epoch": 14.978354978354979, + "grad_norm": 0.421875, + "learning_rate": 4.147431205359098e-10, + "loss": 0.8622, + "step": 12110 + }, + { + "epoch": 14.984539270253556, + "grad_norm": 0.4453125, + "learning_rate": 1.0368583388542519e-10, + "loss": 0.8548, + "step": 12115 + }, + { + "epoch": 14.990723562152134, + "grad_norm": 0.431640625, + "learning_rate": 0.0, + "loss": 0.8485, + "step": 12120 + }, + { + "epoch": 14.990723562152134, + "eval_loss": 2.520606517791748, + "eval_runtime": 0.5387, + "eval_samples_per_second": 18.562, + "eval_steps_per_second": 1.856, + "step": 12120 + }, + { + "epoch": 14.990723562152134, + "step": 12120, + "total_flos": 7.118964864348848e+18, + "train_loss": 0.9255497357436139, + "train_runtime": 41659.6506, + "train_samples_per_second": 13.97, + "train_steps_per_second": 0.291 + } + ], + "logging_steps": 5, + "max_steps": 12120, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 100, + "total_flos": 7.118964864348848e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}