{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.876857306709765, "eval_steps": 1000, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 8.544243812561035, "learning_rate": 1.0427528675703859e-07, "loss": 1.8298, "step": 10 }, { "epoch": 0.0, "grad_norm": 7.121495723724365, "learning_rate": 2.0855057351407717e-07, "loss": 1.8251, "step": 20 }, { "epoch": 0.01, "grad_norm": 7.022141933441162, "learning_rate": 3.128258602711158e-07, "loss": 1.8055, "step": 30 }, { "epoch": 0.01, "grad_norm": 6.6211042404174805, "learning_rate": 4.1710114702815434e-07, "loss": 1.8076, "step": 40 }, { "epoch": 0.01, "grad_norm": 5.393233299255371, "learning_rate": 5.213764337851929e-07, "loss": 1.7492, "step": 50 }, { "epoch": 0.01, "grad_norm": 4.075599193572998, "learning_rate": 6.256517205422316e-07, "loss": 1.7474, "step": 60 }, { "epoch": 0.01, "grad_norm": 3.686915636062622, "learning_rate": 7.299270072992701e-07, "loss": 1.7052, "step": 70 }, { "epoch": 0.02, "grad_norm": 4.735376358032227, "learning_rate": 8.342022940563087e-07, "loss": 1.6499, "step": 80 }, { "epoch": 0.02, "grad_norm": 4.812263011932373, "learning_rate": 9.384775808133473e-07, "loss": 1.6299, "step": 90 }, { "epoch": 0.02, "grad_norm": 2.213949680328369, "learning_rate": 1.0427528675703859e-06, "loss": 1.6052, "step": 100 }, { "epoch": 0.02, "grad_norm": 2.180210590362549, "learning_rate": 1.1470281543274244e-06, "loss": 1.5974, "step": 110 }, { "epoch": 0.03, "grad_norm": 2.1223864555358887, "learning_rate": 1.2513034410844632e-06, "loss": 1.5442, "step": 120 }, { "epoch": 0.03, "grad_norm": 2.524357557296753, "learning_rate": 1.3555787278415016e-06, "loss": 1.5487, "step": 130 }, { "epoch": 0.03, "grad_norm": 1.7285054922103882, "learning_rate": 1.4598540145985402e-06, "loss": 1.5652, "step": 140 }, { "epoch": 0.03, "grad_norm": 1.6190853118896484, "learning_rate": 1.5641293013555788e-06, "loss": 1.5206, "step": 150 }, { "epoch": 0.03, "grad_norm": 1.7167280912399292, "learning_rate": 1.6684045881126174e-06, "loss": 1.5195, "step": 160 }, { "epoch": 0.04, "grad_norm": 1.7101649045944214, "learning_rate": 1.7726798748696562e-06, "loss": 1.5388, "step": 170 }, { "epoch": 0.04, "grad_norm": 1.7588235139846802, "learning_rate": 1.8769551616266945e-06, "loss": 1.5155, "step": 180 }, { "epoch": 0.04, "grad_norm": 1.6155401468276978, "learning_rate": 1.981230448383733e-06, "loss": 1.5149, "step": 190 }, { "epoch": 0.04, "grad_norm": 1.809321403503418, "learning_rate": 2.0855057351407717e-06, "loss": 1.5159, "step": 200 }, { "epoch": 0.04, "grad_norm": 1.6688252687454224, "learning_rate": 2.1897810218978103e-06, "loss": 1.4955, "step": 210 }, { "epoch": 0.05, "grad_norm": 1.6307278871536255, "learning_rate": 2.294056308654849e-06, "loss": 1.5238, "step": 220 }, { "epoch": 0.05, "grad_norm": 1.4807227849960327, "learning_rate": 2.398331595411888e-06, "loss": 1.5086, "step": 230 }, { "epoch": 0.05, "grad_norm": 1.4992481470108032, "learning_rate": 2.5026068821689265e-06, "loss": 1.469, "step": 240 }, { "epoch": 0.05, "grad_norm": 1.8425967693328857, "learning_rate": 2.606882168925965e-06, "loss": 1.5177, "step": 250 }, { "epoch": 0.05, "grad_norm": 1.5325536727905273, "learning_rate": 2.7111574556830032e-06, "loss": 1.4757, "step": 260 }, { "epoch": 0.06, "grad_norm": 1.5502701997756958, "learning_rate": 2.815432742440042e-06, "loss": 1.4808, "step": 270 }, { "epoch": 0.06, "grad_norm": 1.5416339635849, "learning_rate": 2.9197080291970804e-06, "loss": 1.493, "step": 280 }, { "epoch": 0.06, "grad_norm": 1.7341797351837158, "learning_rate": 3.0239833159541194e-06, "loss": 1.4425, "step": 290 }, { "epoch": 0.06, "grad_norm": 1.4314051866531372, "learning_rate": 3.1282586027111576e-06, "loss": 1.4498, "step": 300 }, { "epoch": 0.06, "grad_norm": 1.6512941122055054, "learning_rate": 3.232533889468196e-06, "loss": 1.4687, "step": 310 }, { "epoch": 0.07, "grad_norm": 1.5534512996673584, "learning_rate": 3.3368091762252348e-06, "loss": 1.4615, "step": 320 }, { "epoch": 0.07, "grad_norm": 1.5043481588363647, "learning_rate": 3.4410844629822738e-06, "loss": 1.4729, "step": 330 }, { "epoch": 0.07, "grad_norm": 1.6013394594192505, "learning_rate": 3.5453597497393123e-06, "loss": 1.4477, "step": 340 }, { "epoch": 0.07, "grad_norm": 1.5246102809906006, "learning_rate": 3.6496350364963505e-06, "loss": 1.4602, "step": 350 }, { "epoch": 0.08, "grad_norm": 1.5013917684555054, "learning_rate": 3.753910323253389e-06, "loss": 1.4512, "step": 360 }, { "epoch": 0.08, "grad_norm": 1.8450820446014404, "learning_rate": 3.858185610010428e-06, "loss": 1.4804, "step": 370 }, { "epoch": 0.08, "grad_norm": 1.5352061986923218, "learning_rate": 3.962460896767466e-06, "loss": 1.4408, "step": 380 }, { "epoch": 0.08, "grad_norm": 1.5943782329559326, "learning_rate": 4.066736183524505e-06, "loss": 1.4231, "step": 390 }, { "epoch": 0.08, "grad_norm": 1.7596439123153687, "learning_rate": 4.1710114702815434e-06, "loss": 1.446, "step": 400 }, { "epoch": 0.09, "grad_norm": 1.470418095588684, "learning_rate": 4.275286757038582e-06, "loss": 1.4482, "step": 410 }, { "epoch": 0.09, "grad_norm": 1.4170681238174438, "learning_rate": 4.379562043795621e-06, "loss": 1.4475, "step": 420 }, { "epoch": 0.09, "grad_norm": 1.409445881843567, "learning_rate": 4.483837330552659e-06, "loss": 1.4355, "step": 430 }, { "epoch": 0.09, "grad_norm": 1.552442193031311, "learning_rate": 4.588112617309698e-06, "loss": 1.4281, "step": 440 }, { "epoch": 0.09, "grad_norm": 1.527559518814087, "learning_rate": 4.692387904066736e-06, "loss": 1.4482, "step": 450 }, { "epoch": 0.1, "grad_norm": 1.4792981147766113, "learning_rate": 4.796663190823776e-06, "loss": 1.422, "step": 460 }, { "epoch": 0.1, "grad_norm": 1.5031851530075073, "learning_rate": 4.9009384775808135e-06, "loss": 1.415, "step": 470 }, { "epoch": 0.1, "grad_norm": 1.4678523540496826, "learning_rate": 5.005213764337853e-06, "loss": 1.4362, "step": 480 }, { "epoch": 0.1, "grad_norm": 1.4980840682983398, "learning_rate": 5.1094890510948916e-06, "loss": 1.4098, "step": 490 }, { "epoch": 0.1, "grad_norm": 1.487581491470337, "learning_rate": 5.21376433785193e-06, "loss": 1.4337, "step": 500 }, { "epoch": 0.11, "grad_norm": 1.5128278732299805, "learning_rate": 5.318039624608968e-06, "loss": 1.3967, "step": 510 }, { "epoch": 0.11, "grad_norm": 1.463364839553833, "learning_rate": 5.4223149113660065e-06, "loss": 1.4004, "step": 520 }, { "epoch": 0.11, "grad_norm": 1.5302982330322266, "learning_rate": 5.526590198123045e-06, "loss": 1.4289, "step": 530 }, { "epoch": 0.11, "grad_norm": 1.3981858491897583, "learning_rate": 5.630865484880084e-06, "loss": 1.4046, "step": 540 }, { "epoch": 0.11, "grad_norm": 1.553695797920227, "learning_rate": 5.735140771637122e-06, "loss": 1.4155, "step": 550 }, { "epoch": 0.12, "grad_norm": 1.411907434463501, "learning_rate": 5.839416058394161e-06, "loss": 1.4248, "step": 560 }, { "epoch": 0.12, "grad_norm": 1.7118998765945435, "learning_rate": 5.9436913451512e-06, "loss": 1.3852, "step": 570 }, { "epoch": 0.12, "grad_norm": 1.4115082025527954, "learning_rate": 6.047966631908239e-06, "loss": 1.4028, "step": 580 }, { "epoch": 0.12, "grad_norm": 1.365872859954834, "learning_rate": 6.152241918665277e-06, "loss": 1.407, "step": 590 }, { "epoch": 0.13, "grad_norm": 1.4455780982971191, "learning_rate": 6.256517205422315e-06, "loss": 1.4141, "step": 600 }, { "epoch": 0.13, "grad_norm": 1.3968677520751953, "learning_rate": 6.360792492179354e-06, "loss": 1.3952, "step": 610 }, { "epoch": 0.13, "grad_norm": 1.5289356708526611, "learning_rate": 6.465067778936392e-06, "loss": 1.3892, "step": 620 }, { "epoch": 0.13, "grad_norm": 1.7357593774795532, "learning_rate": 6.569343065693431e-06, "loss": 1.4117, "step": 630 }, { "epoch": 0.13, "grad_norm": 1.5576961040496826, "learning_rate": 6.6736183524504695e-06, "loss": 1.3937, "step": 640 }, { "epoch": 0.14, "grad_norm": 1.3234304189682007, "learning_rate": 6.777893639207508e-06, "loss": 1.3851, "step": 650 }, { "epoch": 0.14, "grad_norm": 1.4000426530838013, "learning_rate": 6.8821689259645475e-06, "loss": 1.398, "step": 660 }, { "epoch": 0.14, "grad_norm": 1.5590957403182983, "learning_rate": 6.986444212721586e-06, "loss": 1.3891, "step": 670 }, { "epoch": 0.14, "grad_norm": 1.4518722295761108, "learning_rate": 7.090719499478625e-06, "loss": 1.404, "step": 680 }, { "epoch": 0.14, "grad_norm": 1.5161577463150024, "learning_rate": 7.1949947862356624e-06, "loss": 1.4004, "step": 690 }, { "epoch": 0.15, "grad_norm": 1.3116180896759033, "learning_rate": 7.299270072992701e-06, "loss": 1.3831, "step": 700 }, { "epoch": 0.15, "grad_norm": 1.5861413478851318, "learning_rate": 7.40354535974974e-06, "loss": 1.3939, "step": 710 }, { "epoch": 0.15, "grad_norm": 1.6353439092636108, "learning_rate": 7.507820646506778e-06, "loss": 1.4047, "step": 720 }, { "epoch": 0.15, "grad_norm": 1.4873729944229126, "learning_rate": 7.612095933263817e-06, "loss": 1.4024, "step": 730 }, { "epoch": 0.15, "grad_norm": 1.4138253927230835, "learning_rate": 7.716371220020855e-06, "loss": 1.4055, "step": 740 }, { "epoch": 0.16, "grad_norm": 1.4762160778045654, "learning_rate": 7.820646506777895e-06, "loss": 1.3882, "step": 750 }, { "epoch": 0.16, "grad_norm": 1.3965837955474854, "learning_rate": 7.924921793534933e-06, "loss": 1.3633, "step": 760 }, { "epoch": 0.16, "grad_norm": 1.4030544757843018, "learning_rate": 8.029197080291972e-06, "loss": 1.3644, "step": 770 }, { "epoch": 0.16, "grad_norm": 1.4032145738601685, "learning_rate": 8.13347236704901e-06, "loss": 1.387, "step": 780 }, { "epoch": 0.16, "grad_norm": 1.5707696676254272, "learning_rate": 8.237747653806049e-06, "loss": 1.3528, "step": 790 }, { "epoch": 0.17, "grad_norm": 1.4654338359832764, "learning_rate": 8.342022940563087e-06, "loss": 1.4175, "step": 800 }, { "epoch": 0.17, "grad_norm": 1.5189216136932373, "learning_rate": 8.446298227320126e-06, "loss": 1.365, "step": 810 }, { "epoch": 0.17, "grad_norm": 1.4133602380752563, "learning_rate": 8.550573514077164e-06, "loss": 1.3441, "step": 820 }, { "epoch": 0.17, "grad_norm": 1.5661669969558716, "learning_rate": 8.654848800834203e-06, "loss": 1.3698, "step": 830 }, { "epoch": 0.18, "grad_norm": 1.5398190021514893, "learning_rate": 8.759124087591241e-06, "loss": 1.3708, "step": 840 }, { "epoch": 0.18, "grad_norm": 1.4412083625793457, "learning_rate": 8.86339937434828e-06, "loss": 1.3779, "step": 850 }, { "epoch": 0.18, "grad_norm": 1.3615409135818481, "learning_rate": 8.967674661105318e-06, "loss": 1.3742, "step": 860 }, { "epoch": 0.18, "grad_norm": 1.4569330215454102, "learning_rate": 9.071949947862358e-06, "loss": 1.3732, "step": 870 }, { "epoch": 0.18, "grad_norm": 1.5276408195495605, "learning_rate": 9.176225234619396e-06, "loss": 1.3875, "step": 880 }, { "epoch": 0.19, "grad_norm": 1.3974804878234863, "learning_rate": 9.280500521376435e-06, "loss": 1.3681, "step": 890 }, { "epoch": 0.19, "grad_norm": 1.4797160625457764, "learning_rate": 9.384775808133473e-06, "loss": 1.3544, "step": 900 }, { "epoch": 0.19, "grad_norm": 1.4647297859191895, "learning_rate": 9.48905109489051e-06, "loss": 1.3627, "step": 910 }, { "epoch": 0.19, "grad_norm": 1.4966884851455688, "learning_rate": 9.593326381647552e-06, "loss": 1.3628, "step": 920 }, { "epoch": 0.19, "grad_norm": 1.4639487266540527, "learning_rate": 9.69760166840459e-06, "loss": 1.3687, "step": 930 }, { "epoch": 0.2, "grad_norm": 1.4451621770858765, "learning_rate": 9.801876955161627e-06, "loss": 1.3855, "step": 940 }, { "epoch": 0.2, "grad_norm": 1.382853388786316, "learning_rate": 9.906152241918667e-06, "loss": 1.3615, "step": 950 }, { "epoch": 0.2, "grad_norm": 1.2870492935180664, "learning_rate": 9.9988413857027e-06, "loss": 1.3495, "step": 960 }, { "epoch": 0.2, "grad_norm": 1.3984755277633667, "learning_rate": 9.987255242729697e-06, "loss": 1.3597, "step": 970 }, { "epoch": 0.2, "grad_norm": 1.3395270109176636, "learning_rate": 9.975669099756692e-06, "loss": 1.3536, "step": 980 }, { "epoch": 0.21, "grad_norm": 1.303995132446289, "learning_rate": 9.964082956783688e-06, "loss": 1.3537, "step": 990 }, { "epoch": 0.21, "grad_norm": 1.629632830619812, "learning_rate": 9.952496813810684e-06, "loss": 1.3498, "step": 1000 }, { "epoch": 0.21, "eval_loss": 1.3506057262420654, "eval_runtime": 60.4268, "eval_samples_per_second": 102.603, "eval_steps_per_second": 12.825, "step": 1000 }, { "epoch": 0.21, "grad_norm": 1.3876349925994873, "learning_rate": 9.940910670837679e-06, "loss": 1.3521, "step": 1010 }, { "epoch": 0.21, "grad_norm": 1.4891064167022705, "learning_rate": 9.929324527864674e-06, "loss": 1.3506, "step": 1020 }, { "epoch": 0.21, "grad_norm": 1.3278287649154663, "learning_rate": 9.91773838489167e-06, "loss": 1.3458, "step": 1030 }, { "epoch": 0.22, "grad_norm": 1.5127308368682861, "learning_rate": 9.906152241918667e-06, "loss": 1.3477, "step": 1040 }, { "epoch": 0.22, "grad_norm": 1.3162195682525635, "learning_rate": 9.894566098945661e-06, "loss": 1.3444, "step": 1050 }, { "epoch": 0.22, "grad_norm": 1.4123969078063965, "learning_rate": 9.882979955972658e-06, "loss": 1.3568, "step": 1060 }, { "epoch": 0.22, "grad_norm": 1.531692624092102, "learning_rate": 9.871393812999654e-06, "loss": 1.3464, "step": 1070 }, { "epoch": 0.23, "grad_norm": 1.4226391315460205, "learning_rate": 9.859807670026649e-06, "loss": 1.3327, "step": 1080 }, { "epoch": 0.23, "grad_norm": 1.4860808849334717, "learning_rate": 9.848221527053643e-06, "loss": 1.3826, "step": 1090 }, { "epoch": 0.23, "grad_norm": 1.4818434715270996, "learning_rate": 9.836635384080641e-06, "loss": 1.3327, "step": 1100 }, { "epoch": 0.23, "grad_norm": 1.58639395236969, "learning_rate": 9.825049241107636e-06, "loss": 1.3461, "step": 1110 }, { "epoch": 0.23, "grad_norm": 1.4077178239822388, "learning_rate": 9.81346309813463e-06, "loss": 1.309, "step": 1120 }, { "epoch": 0.24, "grad_norm": 1.3896112442016602, "learning_rate": 9.801876955161627e-06, "loss": 1.3266, "step": 1130 }, { "epoch": 0.24, "grad_norm": 1.508527159690857, "learning_rate": 9.790290812188623e-06, "loss": 1.3764, "step": 1140 }, { "epoch": 0.24, "grad_norm": 1.311788558959961, "learning_rate": 9.778704669215618e-06, "loss": 1.3445, "step": 1150 }, { "epoch": 0.24, "grad_norm": 1.4742310047149658, "learning_rate": 9.767118526242615e-06, "loss": 1.3338, "step": 1160 }, { "epoch": 0.24, "grad_norm": 1.3195024728775024, "learning_rate": 9.755532383269611e-06, "loss": 1.3444, "step": 1170 }, { "epoch": 0.25, "grad_norm": 1.4474618434906006, "learning_rate": 9.743946240296606e-06, "loss": 1.3332, "step": 1180 }, { "epoch": 0.25, "grad_norm": 1.3813029527664185, "learning_rate": 9.732360097323602e-06, "loss": 1.3745, "step": 1190 }, { "epoch": 0.25, "grad_norm": 1.370364785194397, "learning_rate": 9.720773954350598e-06, "loss": 1.3124, "step": 1200 }, { "epoch": 0.25, "grad_norm": 1.4230958223342896, "learning_rate": 9.709187811377593e-06, "loss": 1.3224, "step": 1210 }, { "epoch": 0.25, "grad_norm": 1.5277326107025146, "learning_rate": 9.69760166840459e-06, "loss": 1.3368, "step": 1220 }, { "epoch": 0.26, "grad_norm": 1.424777626991272, "learning_rate": 9.686015525431584e-06, "loss": 1.3202, "step": 1230 }, { "epoch": 0.26, "grad_norm": 1.5502588748931885, "learning_rate": 9.67442938245858e-06, "loss": 1.3127, "step": 1240 }, { "epoch": 0.26, "grad_norm": 1.5083343982696533, "learning_rate": 9.662843239485577e-06, "loss": 1.3234, "step": 1250 }, { "epoch": 0.26, "grad_norm": 1.3840395212173462, "learning_rate": 9.651257096512571e-06, "loss": 1.3241, "step": 1260 }, { "epoch": 0.26, "grad_norm": 1.3413887023925781, "learning_rate": 9.639670953539568e-06, "loss": 1.3247, "step": 1270 }, { "epoch": 0.27, "grad_norm": 1.3408432006835938, "learning_rate": 9.628084810566564e-06, "loss": 1.3439, "step": 1280 }, { "epoch": 0.27, "grad_norm": 1.302042007446289, "learning_rate": 9.616498667593559e-06, "loss": 1.3261, "step": 1290 }, { "epoch": 0.27, "grad_norm": 1.3523017168045044, "learning_rate": 9.604912524620554e-06, "loss": 1.3412, "step": 1300 }, { "epoch": 0.27, "grad_norm": 1.3820520639419556, "learning_rate": 9.593326381647552e-06, "loss": 1.3457, "step": 1310 }, { "epoch": 0.28, "grad_norm": 1.5620774030685425, "learning_rate": 9.581740238674546e-06, "loss": 1.3086, "step": 1320 }, { "epoch": 0.28, "grad_norm": 1.440212607383728, "learning_rate": 9.570154095701541e-06, "loss": 1.3197, "step": 1330 }, { "epoch": 0.28, "grad_norm": 1.2081576585769653, "learning_rate": 9.558567952728537e-06, "loss": 1.314, "step": 1340 }, { "epoch": 0.28, "grad_norm": 1.5691925287246704, "learning_rate": 9.546981809755534e-06, "loss": 1.319, "step": 1350 }, { "epoch": 0.28, "grad_norm": 1.5542395114898682, "learning_rate": 9.535395666782528e-06, "loss": 1.293, "step": 1360 }, { "epoch": 0.29, "grad_norm": 1.3244450092315674, "learning_rate": 9.523809523809525e-06, "loss": 1.3092, "step": 1370 }, { "epoch": 0.29, "grad_norm": 1.5976145267486572, "learning_rate": 9.512223380836521e-06, "loss": 1.3379, "step": 1380 }, { "epoch": 0.29, "grad_norm": 1.3805755376815796, "learning_rate": 9.500637237863516e-06, "loss": 1.3325, "step": 1390 }, { "epoch": 0.29, "grad_norm": 1.6162505149841309, "learning_rate": 9.48905109489051e-06, "loss": 1.3056, "step": 1400 }, { "epoch": 0.29, "grad_norm": 1.4099946022033691, "learning_rate": 9.477464951917509e-06, "loss": 1.3308, "step": 1410 }, { "epoch": 0.3, "grad_norm": 1.3236525058746338, "learning_rate": 9.465878808944503e-06, "loss": 1.3299, "step": 1420 }, { "epoch": 0.3, "grad_norm": 1.3962891101837158, "learning_rate": 9.454292665971498e-06, "loss": 1.3196, "step": 1430 }, { "epoch": 0.3, "grad_norm": 1.3816227912902832, "learning_rate": 9.442706522998494e-06, "loss": 1.3104, "step": 1440 }, { "epoch": 0.3, "grad_norm": 1.3028792142868042, "learning_rate": 9.43112038002549e-06, "loss": 1.3022, "step": 1450 }, { "epoch": 0.3, "grad_norm": 1.361894965171814, "learning_rate": 9.419534237052485e-06, "loss": 1.3456, "step": 1460 }, { "epoch": 0.31, "grad_norm": 1.392577886581421, "learning_rate": 9.407948094079482e-06, "loss": 1.3036, "step": 1470 }, { "epoch": 0.31, "grad_norm": 1.4495409727096558, "learning_rate": 9.396361951106478e-06, "loss": 1.3233, "step": 1480 }, { "epoch": 0.31, "grad_norm": 1.3723502159118652, "learning_rate": 9.384775808133473e-06, "loss": 1.3269, "step": 1490 }, { "epoch": 0.31, "grad_norm": 1.2673066854476929, "learning_rate": 9.373189665160469e-06, "loss": 1.2977, "step": 1500 }, { "epoch": 0.31, "grad_norm": 1.491845965385437, "learning_rate": 9.361603522187464e-06, "loss": 1.3207, "step": 1510 }, { "epoch": 0.32, "grad_norm": 1.456560730934143, "learning_rate": 9.35001737921446e-06, "loss": 1.3064, "step": 1520 }, { "epoch": 0.32, "grad_norm": 1.399476170539856, "learning_rate": 9.338431236241457e-06, "loss": 1.3232, "step": 1530 }, { "epoch": 0.32, "grad_norm": 1.3272968530654907, "learning_rate": 9.326845093268451e-06, "loss": 1.3271, "step": 1540 }, { "epoch": 0.32, "grad_norm": 1.3097448348999023, "learning_rate": 9.315258950295448e-06, "loss": 1.3015, "step": 1550 }, { "epoch": 0.33, "grad_norm": 1.3764145374298096, "learning_rate": 9.303672807322444e-06, "loss": 1.3044, "step": 1560 }, { "epoch": 0.33, "grad_norm": 1.4176369905471802, "learning_rate": 9.292086664349439e-06, "loss": 1.3069, "step": 1570 }, { "epoch": 0.33, "grad_norm": 1.3119934797286987, "learning_rate": 9.280500521376435e-06, "loss": 1.3008, "step": 1580 }, { "epoch": 0.33, "grad_norm": 1.2886179685592651, "learning_rate": 9.268914378403431e-06, "loss": 1.2901, "step": 1590 }, { "epoch": 0.33, "grad_norm": 1.2770109176635742, "learning_rate": 9.257328235430426e-06, "loss": 1.2761, "step": 1600 }, { "epoch": 0.34, "grad_norm": 1.4204245805740356, "learning_rate": 9.24574209245742e-06, "loss": 1.2895, "step": 1610 }, { "epoch": 0.34, "grad_norm": 1.4266266822814941, "learning_rate": 9.234155949484417e-06, "loss": 1.2987, "step": 1620 }, { "epoch": 0.34, "grad_norm": 1.3138302564620972, "learning_rate": 9.222569806511413e-06, "loss": 1.3048, "step": 1630 }, { "epoch": 0.34, "grad_norm": 1.3465932607650757, "learning_rate": 9.210983663538408e-06, "loss": 1.3133, "step": 1640 }, { "epoch": 0.34, "grad_norm": 1.4104775190353394, "learning_rate": 9.199397520565405e-06, "loss": 1.3001, "step": 1650 }, { "epoch": 0.35, "grad_norm": 1.3274825811386108, "learning_rate": 9.187811377592401e-06, "loss": 1.3009, "step": 1660 }, { "epoch": 0.35, "grad_norm": 1.4182919263839722, "learning_rate": 9.176225234619396e-06, "loss": 1.3104, "step": 1670 }, { "epoch": 0.35, "grad_norm": 1.34902024269104, "learning_rate": 9.164639091646392e-06, "loss": 1.281, "step": 1680 }, { "epoch": 0.35, "grad_norm": 1.463240146636963, "learning_rate": 9.153052948673388e-06, "loss": 1.3048, "step": 1690 }, { "epoch": 0.35, "grad_norm": 1.3565565347671509, "learning_rate": 9.141466805700383e-06, "loss": 1.2868, "step": 1700 }, { "epoch": 0.36, "grad_norm": 1.2957673072814941, "learning_rate": 9.129880662727378e-06, "loss": 1.2985, "step": 1710 }, { "epoch": 0.36, "grad_norm": 1.392398476600647, "learning_rate": 9.118294519754374e-06, "loss": 1.2876, "step": 1720 }, { "epoch": 0.36, "grad_norm": 1.4857003688812256, "learning_rate": 9.10670837678137e-06, "loss": 1.3095, "step": 1730 }, { "epoch": 0.36, "grad_norm": 1.3010202646255493, "learning_rate": 9.095122233808365e-06, "loss": 1.2914, "step": 1740 }, { "epoch": 0.36, "grad_norm": 1.4109833240509033, "learning_rate": 9.083536090835361e-06, "loss": 1.2908, "step": 1750 }, { "epoch": 0.37, "grad_norm": 1.4451119899749756, "learning_rate": 9.071949947862358e-06, "loss": 1.301, "step": 1760 }, { "epoch": 0.37, "grad_norm": 1.3937684297561646, "learning_rate": 9.060363804889353e-06, "loss": 1.3257, "step": 1770 }, { "epoch": 0.37, "grad_norm": 1.2829424142837524, "learning_rate": 9.048777661916349e-06, "loss": 1.2825, "step": 1780 }, { "epoch": 0.37, "grad_norm": 1.3617833852767944, "learning_rate": 9.037191518943345e-06, "loss": 1.2924, "step": 1790 }, { "epoch": 0.38, "grad_norm": 1.2105286121368408, "learning_rate": 9.02560537597034e-06, "loss": 1.312, "step": 1800 }, { "epoch": 0.38, "grad_norm": 1.2995182275772095, "learning_rate": 9.014019232997336e-06, "loss": 1.2883, "step": 1810 }, { "epoch": 0.38, "grad_norm": 1.3444055318832397, "learning_rate": 9.002433090024331e-06, "loss": 1.2948, "step": 1820 }, { "epoch": 0.38, "grad_norm": 1.4586807489395142, "learning_rate": 8.990846947051327e-06, "loss": 1.3089, "step": 1830 }, { "epoch": 0.38, "grad_norm": 1.3522496223449707, "learning_rate": 8.979260804078324e-06, "loss": 1.309, "step": 1840 }, { "epoch": 0.39, "grad_norm": 1.3606278896331787, "learning_rate": 8.967674661105318e-06, "loss": 1.284, "step": 1850 }, { "epoch": 0.39, "grad_norm": 1.3653103113174438, "learning_rate": 8.956088518132315e-06, "loss": 1.2998, "step": 1860 }, { "epoch": 0.39, "grad_norm": 1.3912575244903564, "learning_rate": 8.944502375159311e-06, "loss": 1.292, "step": 1870 }, { "epoch": 0.39, "grad_norm": 1.2404125928878784, "learning_rate": 8.932916232186306e-06, "loss": 1.3031, "step": 1880 }, { "epoch": 0.39, "grad_norm": 1.4737727642059326, "learning_rate": 8.921330089213302e-06, "loss": 1.274, "step": 1890 }, { "epoch": 0.4, "grad_norm": 1.5364305973052979, "learning_rate": 8.909743946240297e-06, "loss": 1.2855, "step": 1900 }, { "epoch": 0.4, "grad_norm": 1.2872428894042969, "learning_rate": 8.898157803267293e-06, "loss": 1.2933, "step": 1910 }, { "epoch": 0.4, "grad_norm": 1.2924387454986572, "learning_rate": 8.886571660294288e-06, "loss": 1.2801, "step": 1920 }, { "epoch": 0.4, "grad_norm": 1.4313395023345947, "learning_rate": 8.874985517321284e-06, "loss": 1.2686, "step": 1930 }, { "epoch": 0.4, "grad_norm": 1.3312171697616577, "learning_rate": 8.86339937434828e-06, "loss": 1.2913, "step": 1940 }, { "epoch": 0.41, "grad_norm": 1.3304767608642578, "learning_rate": 8.851813231375275e-06, "loss": 1.3036, "step": 1950 }, { "epoch": 0.41, "grad_norm": 1.3129816055297852, "learning_rate": 8.840227088402272e-06, "loss": 1.3102, "step": 1960 }, { "epoch": 0.41, "grad_norm": 1.4595379829406738, "learning_rate": 8.828640945429268e-06, "loss": 1.2758, "step": 1970 }, { "epoch": 0.41, "grad_norm": 1.3031460046768188, "learning_rate": 8.817054802456263e-06, "loss": 1.2598, "step": 1980 }, { "epoch": 0.41, "grad_norm": 1.4015743732452393, "learning_rate": 8.805468659483259e-06, "loss": 1.2897, "step": 1990 }, { "epoch": 0.42, "grad_norm": 1.456390380859375, "learning_rate": 8.793882516510255e-06, "loss": 1.2952, "step": 2000 }, { "epoch": 0.42, "eval_loss": 1.2834819555282593, "eval_runtime": 60.3193, "eval_samples_per_second": 102.786, "eval_steps_per_second": 12.848, "step": 2000 }, { "epoch": 0.42, "grad_norm": 1.4190367460250854, "learning_rate": 8.78229637353725e-06, "loss": 1.2863, "step": 2010 }, { "epoch": 0.42, "grad_norm": 1.3781564235687256, "learning_rate": 8.770710230564245e-06, "loss": 1.2898, "step": 2020 }, { "epoch": 0.42, "grad_norm": 1.4007055759429932, "learning_rate": 8.759124087591241e-06, "loss": 1.3049, "step": 2030 }, { "epoch": 0.43, "grad_norm": 1.46671462059021, "learning_rate": 8.747537944618238e-06, "loss": 1.2931, "step": 2040 }, { "epoch": 0.43, "grad_norm": 1.3067024946212769, "learning_rate": 8.735951801645232e-06, "loss": 1.2758, "step": 2050 }, { "epoch": 0.43, "grad_norm": 1.3156636953353882, "learning_rate": 8.724365658672229e-06, "loss": 1.2912, "step": 2060 }, { "epoch": 0.43, "grad_norm": 1.4328694343566895, "learning_rate": 8.712779515699225e-06, "loss": 1.2687, "step": 2070 }, { "epoch": 0.43, "grad_norm": 1.3482651710510254, "learning_rate": 8.70119337272622e-06, "loss": 1.2815, "step": 2080 }, { "epoch": 0.44, "grad_norm": 1.3155994415283203, "learning_rate": 8.689607229753216e-06, "loss": 1.2646, "step": 2090 }, { "epoch": 0.44, "grad_norm": 1.3547475337982178, "learning_rate": 8.678021086780212e-06, "loss": 1.2726, "step": 2100 }, { "epoch": 0.44, "grad_norm": 1.439499020576477, "learning_rate": 8.666434943807207e-06, "loss": 1.2896, "step": 2110 }, { "epoch": 0.44, "grad_norm": 1.302492618560791, "learning_rate": 8.654848800834203e-06, "loss": 1.263, "step": 2120 }, { "epoch": 0.44, "grad_norm": 1.2857979536056519, "learning_rate": 8.643262657861198e-06, "loss": 1.2636, "step": 2130 }, { "epoch": 0.45, "grad_norm": 1.362838864326477, "learning_rate": 8.631676514888195e-06, "loss": 1.2554, "step": 2140 }, { "epoch": 0.45, "grad_norm": 1.4692515134811401, "learning_rate": 8.62009037191519e-06, "loss": 1.2589, "step": 2150 }, { "epoch": 0.45, "grad_norm": 1.3445467948913574, "learning_rate": 8.608504228942186e-06, "loss": 1.2829, "step": 2160 }, { "epoch": 0.45, "grad_norm": 1.2820574045181274, "learning_rate": 8.596918085969182e-06, "loss": 1.266, "step": 2170 }, { "epoch": 0.45, "grad_norm": 1.4762849807739258, "learning_rate": 8.585331942996177e-06, "loss": 1.2924, "step": 2180 }, { "epoch": 0.46, "grad_norm": 1.3395150899887085, "learning_rate": 8.573745800023173e-06, "loss": 1.2724, "step": 2190 }, { "epoch": 0.46, "grad_norm": 1.4047057628631592, "learning_rate": 8.56215965705017e-06, "loss": 1.2877, "step": 2200 }, { "epoch": 0.46, "grad_norm": 1.3061277866363525, "learning_rate": 8.550573514077164e-06, "loss": 1.2805, "step": 2210 }, { "epoch": 0.46, "grad_norm": 1.300742268562317, "learning_rate": 8.53898737110416e-06, "loss": 1.2851, "step": 2220 }, { "epoch": 0.47, "grad_norm": 1.3588734865188599, "learning_rate": 8.527401228131155e-06, "loss": 1.2537, "step": 2230 }, { "epoch": 0.47, "grad_norm": 1.4370704889297485, "learning_rate": 8.515815085158151e-06, "loss": 1.2779, "step": 2240 }, { "epoch": 0.47, "grad_norm": 1.3714896440505981, "learning_rate": 8.504228942185148e-06, "loss": 1.2707, "step": 2250 }, { "epoch": 0.47, "grad_norm": 1.4633715152740479, "learning_rate": 8.492642799212143e-06, "loss": 1.2882, "step": 2260 }, { "epoch": 0.47, "grad_norm": 1.459731101989746, "learning_rate": 8.481056656239139e-06, "loss": 1.2896, "step": 2270 }, { "epoch": 0.48, "grad_norm": 1.3027414083480835, "learning_rate": 8.469470513266135e-06, "loss": 1.2703, "step": 2280 }, { "epoch": 0.48, "grad_norm": 1.460031509399414, "learning_rate": 8.45788437029313e-06, "loss": 1.2871, "step": 2290 }, { "epoch": 0.48, "grad_norm": 1.3085455894470215, "learning_rate": 8.446298227320126e-06, "loss": 1.2552, "step": 2300 }, { "epoch": 0.48, "grad_norm": 1.2865551710128784, "learning_rate": 8.434712084347123e-06, "loss": 1.2706, "step": 2310 }, { "epoch": 0.48, "grad_norm": 1.357158899307251, "learning_rate": 8.423125941374117e-06, "loss": 1.2862, "step": 2320 }, { "epoch": 0.49, "grad_norm": 1.4073933362960815, "learning_rate": 8.411539798401112e-06, "loss": 1.2753, "step": 2330 }, { "epoch": 0.49, "grad_norm": 1.3302937746047974, "learning_rate": 8.399953655428108e-06, "loss": 1.2641, "step": 2340 }, { "epoch": 0.49, "grad_norm": 1.2936241626739502, "learning_rate": 8.388367512455105e-06, "loss": 1.2823, "step": 2350 }, { "epoch": 0.49, "grad_norm": 1.339048147201538, "learning_rate": 8.3767813694821e-06, "loss": 1.2477, "step": 2360 }, { "epoch": 0.49, "grad_norm": 1.3758487701416016, "learning_rate": 8.365195226509096e-06, "loss": 1.2625, "step": 2370 }, { "epoch": 0.5, "grad_norm": 1.2881660461425781, "learning_rate": 8.353609083536092e-06, "loss": 1.2886, "step": 2380 }, { "epoch": 0.5, "grad_norm": 1.3712249994277954, "learning_rate": 8.342022940563087e-06, "loss": 1.2708, "step": 2390 }, { "epoch": 0.5, "grad_norm": 1.2772799730300903, "learning_rate": 8.330436797590082e-06, "loss": 1.2669, "step": 2400 }, { "epoch": 0.5, "grad_norm": 1.2048228979110718, "learning_rate": 8.31885065461708e-06, "loss": 1.2543, "step": 2410 }, { "epoch": 0.5, "grad_norm": 1.4433040618896484, "learning_rate": 8.307264511644074e-06, "loss": 1.2656, "step": 2420 }, { "epoch": 0.51, "grad_norm": 1.366408109664917, "learning_rate": 8.295678368671069e-06, "loss": 1.2384, "step": 2430 }, { "epoch": 0.51, "grad_norm": 1.311207890510559, "learning_rate": 8.284092225698065e-06, "loss": 1.2826, "step": 2440 }, { "epoch": 0.51, "grad_norm": 1.3012316226959229, "learning_rate": 8.272506082725062e-06, "loss": 1.2813, "step": 2450 }, { "epoch": 0.51, "grad_norm": 1.3766871690750122, "learning_rate": 8.260919939752056e-06, "loss": 1.2777, "step": 2460 }, { "epoch": 0.52, "grad_norm": 1.328155279159546, "learning_rate": 8.249333796779053e-06, "loss": 1.2778, "step": 2470 }, { "epoch": 0.52, "grad_norm": 1.354087471961975, "learning_rate": 8.237747653806049e-06, "loss": 1.2766, "step": 2480 }, { "epoch": 0.52, "grad_norm": 1.2256219387054443, "learning_rate": 8.226161510833044e-06, "loss": 1.2542, "step": 2490 }, { "epoch": 0.52, "grad_norm": 1.453528881072998, "learning_rate": 8.21457536786004e-06, "loss": 1.2778, "step": 2500 }, { "epoch": 0.52, "grad_norm": 1.4232090711593628, "learning_rate": 8.202989224887037e-06, "loss": 1.2963, "step": 2510 }, { "epoch": 0.53, "grad_norm": 1.3284271955490112, "learning_rate": 8.191403081914031e-06, "loss": 1.27, "step": 2520 }, { "epoch": 0.53, "grad_norm": 1.4262343645095825, "learning_rate": 8.179816938941028e-06, "loss": 1.2728, "step": 2530 }, { "epoch": 0.53, "grad_norm": 1.3023653030395508, "learning_rate": 8.168230795968022e-06, "loss": 1.2658, "step": 2540 }, { "epoch": 0.53, "grad_norm": 1.3244940042495728, "learning_rate": 8.156644652995019e-06, "loss": 1.2631, "step": 2550 }, { "epoch": 0.53, "grad_norm": 1.3604815006256104, "learning_rate": 8.145058510022015e-06, "loss": 1.2793, "step": 2560 }, { "epoch": 0.54, "grad_norm": 1.3496869802474976, "learning_rate": 8.13347236704901e-06, "loss": 1.2839, "step": 2570 }, { "epoch": 0.54, "grad_norm": 1.476453423500061, "learning_rate": 8.121886224076006e-06, "loss": 1.2556, "step": 2580 }, { "epoch": 0.54, "grad_norm": 1.4949761629104614, "learning_rate": 8.110300081103002e-06, "loss": 1.2769, "step": 2590 }, { "epoch": 0.54, "grad_norm": 1.2385618686676025, "learning_rate": 8.098713938129997e-06, "loss": 1.2714, "step": 2600 }, { "epoch": 0.54, "grad_norm": 1.3485174179077148, "learning_rate": 8.087127795156992e-06, "loss": 1.2749, "step": 2610 }, { "epoch": 0.55, "grad_norm": 1.4038894176483154, "learning_rate": 8.07554165218399e-06, "loss": 1.2733, "step": 2620 }, { "epoch": 0.55, "grad_norm": 1.3258107900619507, "learning_rate": 8.063955509210985e-06, "loss": 1.2767, "step": 2630 }, { "epoch": 0.55, "grad_norm": 1.3624732494354248, "learning_rate": 8.05236936623798e-06, "loss": 1.2651, "step": 2640 }, { "epoch": 0.55, "grad_norm": 1.4360556602478027, "learning_rate": 8.040783223264976e-06, "loss": 1.2519, "step": 2650 }, { "epoch": 0.55, "grad_norm": 1.4374834299087524, "learning_rate": 8.029197080291972e-06, "loss": 1.2664, "step": 2660 }, { "epoch": 0.56, "grad_norm": 1.4204131364822388, "learning_rate": 8.017610937318967e-06, "loss": 1.271, "step": 2670 }, { "epoch": 0.56, "grad_norm": 1.3522765636444092, "learning_rate": 8.006024794345963e-06, "loss": 1.2499, "step": 2680 }, { "epoch": 0.56, "grad_norm": 1.373672604560852, "learning_rate": 7.99443865137296e-06, "loss": 1.2821, "step": 2690 }, { "epoch": 0.56, "grad_norm": 1.474179983139038, "learning_rate": 7.982852508399954e-06, "loss": 1.2627, "step": 2700 }, { "epoch": 0.57, "grad_norm": 1.4560604095458984, "learning_rate": 7.971266365426949e-06, "loss": 1.2479, "step": 2710 }, { "epoch": 0.57, "grad_norm": 1.3803871870040894, "learning_rate": 7.959680222453947e-06, "loss": 1.2693, "step": 2720 }, { "epoch": 0.57, "grad_norm": 1.2998815774917603, "learning_rate": 7.948094079480941e-06, "loss": 1.2477, "step": 2730 }, { "epoch": 0.57, "grad_norm": 1.3741968870162964, "learning_rate": 7.936507936507936e-06, "loss": 1.25, "step": 2740 }, { "epoch": 0.57, "grad_norm": 1.2343312501907349, "learning_rate": 7.924921793534933e-06, "loss": 1.2628, "step": 2750 }, { "epoch": 0.58, "grad_norm": 1.3123588562011719, "learning_rate": 7.913335650561929e-06, "loss": 1.2817, "step": 2760 }, { "epoch": 0.58, "grad_norm": 1.3881012201309204, "learning_rate": 7.901749507588924e-06, "loss": 1.2461, "step": 2770 }, { "epoch": 0.58, "grad_norm": 1.283443808555603, "learning_rate": 7.89016336461592e-06, "loss": 1.2736, "step": 2780 }, { "epoch": 0.58, "grad_norm": 1.3242292404174805, "learning_rate": 7.878577221642916e-06, "loss": 1.2536, "step": 2790 }, { "epoch": 0.58, "grad_norm": 1.3034474849700928, "learning_rate": 7.866991078669911e-06, "loss": 1.2494, "step": 2800 }, { "epoch": 0.59, "grad_norm": 1.243727445602417, "learning_rate": 7.855404935696907e-06, "loss": 1.2427, "step": 2810 }, { "epoch": 0.59, "grad_norm": 1.3450164794921875, "learning_rate": 7.843818792723902e-06, "loss": 1.2643, "step": 2820 }, { "epoch": 0.59, "grad_norm": 1.4294393062591553, "learning_rate": 7.832232649750898e-06, "loss": 1.262, "step": 2830 }, { "epoch": 0.59, "grad_norm": 1.413553237915039, "learning_rate": 7.820646506777895e-06, "loss": 1.2547, "step": 2840 }, { "epoch": 0.59, "grad_norm": 1.3107550144195557, "learning_rate": 7.80906036380489e-06, "loss": 1.2387, "step": 2850 }, { "epoch": 0.6, "grad_norm": 1.3627985715866089, "learning_rate": 7.797474220831886e-06, "loss": 1.2441, "step": 2860 }, { "epoch": 0.6, "grad_norm": 1.2843525409698486, "learning_rate": 7.785888077858882e-06, "loss": 1.2678, "step": 2870 }, { "epoch": 0.6, "grad_norm": 1.1956937313079834, "learning_rate": 7.774301934885877e-06, "loss": 1.2604, "step": 2880 }, { "epoch": 0.6, "grad_norm": 1.4336110353469849, "learning_rate": 7.762715791912873e-06, "loss": 1.2554, "step": 2890 }, { "epoch": 0.6, "grad_norm": 1.223803997039795, "learning_rate": 7.75112964893987e-06, "loss": 1.2484, "step": 2900 }, { "epoch": 0.61, "grad_norm": 1.4403553009033203, "learning_rate": 7.739543505966864e-06, "loss": 1.255, "step": 2910 }, { "epoch": 0.61, "grad_norm": 1.351047396659851, "learning_rate": 7.727957362993859e-06, "loss": 1.2353, "step": 2920 }, { "epoch": 0.61, "grad_norm": 1.5024089813232422, "learning_rate": 7.716371220020855e-06, "loss": 1.2607, "step": 2930 }, { "epoch": 0.61, "grad_norm": 1.299880027770996, "learning_rate": 7.704785077047852e-06, "loss": 1.2835, "step": 2940 }, { "epoch": 0.62, "grad_norm": 1.4588313102722168, "learning_rate": 7.693198934074846e-06, "loss": 1.2534, "step": 2950 }, { "epoch": 0.62, "grad_norm": 1.3417565822601318, "learning_rate": 7.681612791101843e-06, "loss": 1.2288, "step": 2960 }, { "epoch": 0.62, "grad_norm": 1.3168967962265015, "learning_rate": 7.67002664812884e-06, "loss": 1.2291, "step": 2970 }, { "epoch": 0.62, "grad_norm": 1.223056674003601, "learning_rate": 7.658440505155834e-06, "loss": 1.2566, "step": 2980 }, { "epoch": 0.62, "grad_norm": 1.3580880165100098, "learning_rate": 7.64685436218283e-06, "loss": 1.2644, "step": 2990 }, { "epoch": 0.63, "grad_norm": 1.3440959453582764, "learning_rate": 7.635268219209827e-06, "loss": 1.2476, "step": 3000 }, { "epoch": 0.63, "eval_loss": 1.2491159439086914, "eval_runtime": 60.2595, "eval_samples_per_second": 102.888, "eval_steps_per_second": 12.861, "step": 3000 }, { "epoch": 0.63, "grad_norm": 1.398789882659912, "learning_rate": 7.623682076236821e-06, "loss": 1.2305, "step": 3010 }, { "epoch": 0.63, "grad_norm": 1.2427514791488647, "learning_rate": 7.612095933263817e-06, "loss": 1.2414, "step": 3020 }, { "epoch": 0.63, "grad_norm": 1.2460520267486572, "learning_rate": 7.600509790290813e-06, "loss": 1.2453, "step": 3030 }, { "epoch": 0.63, "grad_norm": 1.310631513595581, "learning_rate": 7.588923647317809e-06, "loss": 1.2483, "step": 3040 }, { "epoch": 0.64, "grad_norm": 1.408805012702942, "learning_rate": 7.577337504344804e-06, "loss": 1.2074, "step": 3050 }, { "epoch": 0.64, "grad_norm": 1.334288239479065, "learning_rate": 7.5657513613718006e-06, "loss": 1.251, "step": 3060 }, { "epoch": 0.64, "grad_norm": 1.305930733680725, "learning_rate": 7.554165218398796e-06, "loss": 1.251, "step": 3070 }, { "epoch": 0.64, "grad_norm": 1.4388633966445923, "learning_rate": 7.542579075425791e-06, "loss": 1.2508, "step": 3080 }, { "epoch": 0.64, "grad_norm": 1.370788812637329, "learning_rate": 7.530992932452787e-06, "loss": 1.2571, "step": 3090 }, { "epoch": 0.65, "grad_norm": 1.303532600402832, "learning_rate": 7.519406789479783e-06, "loss": 1.269, "step": 3100 }, { "epoch": 0.65, "grad_norm": 1.3756309747695923, "learning_rate": 7.507820646506778e-06, "loss": 1.2607, "step": 3110 }, { "epoch": 0.65, "grad_norm": 1.3260369300842285, "learning_rate": 7.4962345035337746e-06, "loss": 1.2225, "step": 3120 }, { "epoch": 0.65, "grad_norm": 1.414718508720398, "learning_rate": 7.48464836056077e-06, "loss": 1.2436, "step": 3130 }, { "epoch": 0.65, "grad_norm": 1.2363390922546387, "learning_rate": 7.473062217587766e-06, "loss": 1.2376, "step": 3140 }, { "epoch": 0.66, "grad_norm": 1.259230613708496, "learning_rate": 7.461476074614762e-06, "loss": 1.2502, "step": 3150 }, { "epoch": 0.66, "grad_norm": 1.397451400756836, "learning_rate": 7.4498899316417575e-06, "loss": 1.2514, "step": 3160 }, { "epoch": 0.66, "grad_norm": 1.3317039012908936, "learning_rate": 7.438303788668752e-06, "loss": 1.2483, "step": 3170 }, { "epoch": 0.66, "grad_norm": 1.3065232038497925, "learning_rate": 7.426717645695749e-06, "loss": 1.2455, "step": 3180 }, { "epoch": 0.67, "grad_norm": 1.4810526371002197, "learning_rate": 7.415131502722744e-06, "loss": 1.2716, "step": 3190 }, { "epoch": 0.67, "grad_norm": 1.314434289932251, "learning_rate": 7.40354535974974e-06, "loss": 1.212, "step": 3200 }, { "epoch": 0.67, "grad_norm": 1.2311912775039673, "learning_rate": 7.391959216776735e-06, "loss": 1.2214, "step": 3210 }, { "epoch": 0.67, "grad_norm": 1.2313491106033325, "learning_rate": 7.3803730738037315e-06, "loss": 1.223, "step": 3220 }, { "epoch": 0.67, "grad_norm": 1.4492835998535156, "learning_rate": 7.368786930830727e-06, "loss": 1.2434, "step": 3230 }, { "epoch": 0.68, "grad_norm": 1.3827974796295166, "learning_rate": 7.3572007878577225e-06, "loss": 1.2159, "step": 3240 }, { "epoch": 0.68, "grad_norm": 1.3691293001174927, "learning_rate": 7.345614644884719e-06, "loss": 1.2542, "step": 3250 }, { "epoch": 0.68, "grad_norm": 1.5888055562973022, "learning_rate": 7.3340285019117144e-06, "loss": 1.2334, "step": 3260 }, { "epoch": 0.68, "grad_norm": 1.413843035697937, "learning_rate": 7.322442358938709e-06, "loss": 1.2479, "step": 3270 }, { "epoch": 0.68, "grad_norm": 1.2851001024246216, "learning_rate": 7.310856215965706e-06, "loss": 1.2389, "step": 3280 }, { "epoch": 0.69, "grad_norm": 1.4129613637924194, "learning_rate": 7.299270072992701e-06, "loss": 1.2397, "step": 3290 }, { "epoch": 0.69, "grad_norm": 1.327528953552246, "learning_rate": 7.2876839300196965e-06, "loss": 1.2404, "step": 3300 }, { "epoch": 0.69, "grad_norm": 1.3374613523483276, "learning_rate": 7.276097787046693e-06, "loss": 1.2442, "step": 3310 }, { "epoch": 0.69, "grad_norm": 1.4098230600357056, "learning_rate": 7.2645116440736884e-06, "loss": 1.2434, "step": 3320 }, { "epoch": 0.69, "grad_norm": 1.359785795211792, "learning_rate": 7.252925501100684e-06, "loss": 1.24, "step": 3330 }, { "epoch": 0.7, "grad_norm": 1.5303146839141846, "learning_rate": 7.24133935812768e-06, "loss": 1.2418, "step": 3340 }, { "epoch": 0.7, "grad_norm": 1.2719566822052002, "learning_rate": 7.229753215154676e-06, "loss": 1.2352, "step": 3350 }, { "epoch": 0.7, "grad_norm": 1.3264471292495728, "learning_rate": 7.218167072181671e-06, "loss": 1.2487, "step": 3360 }, { "epoch": 0.7, "grad_norm": 1.4324239492416382, "learning_rate": 7.206580929208668e-06, "loss": 1.2324, "step": 3370 }, { "epoch": 0.7, "grad_norm": 1.4195739030838013, "learning_rate": 7.1949947862356624e-06, "loss": 1.243, "step": 3380 }, { "epoch": 0.71, "grad_norm": 1.2976794242858887, "learning_rate": 7.183408643262658e-06, "loss": 1.245, "step": 3390 }, { "epoch": 0.71, "grad_norm": 1.2134556770324707, "learning_rate": 7.171822500289654e-06, "loss": 1.2501, "step": 3400 }, { "epoch": 0.71, "grad_norm": 1.4034926891326904, "learning_rate": 7.16023635731665e-06, "loss": 1.2501, "step": 3410 }, { "epoch": 0.71, "grad_norm": 1.2896411418914795, "learning_rate": 7.148650214343645e-06, "loss": 1.2168, "step": 3420 }, { "epoch": 0.72, "grad_norm": 1.2246516942977905, "learning_rate": 7.137064071370642e-06, "loss": 1.2563, "step": 3430 }, { "epoch": 0.72, "grad_norm": 1.428499698638916, "learning_rate": 7.125477928397637e-06, "loss": 1.2749, "step": 3440 }, { "epoch": 0.72, "grad_norm": 1.279649257659912, "learning_rate": 7.113891785424633e-06, "loss": 1.2473, "step": 3450 }, { "epoch": 0.72, "grad_norm": 1.3014793395996094, "learning_rate": 7.102305642451628e-06, "loss": 1.238, "step": 3460 }, { "epoch": 0.72, "grad_norm": 1.2327125072479248, "learning_rate": 7.090719499478625e-06, "loss": 1.2499, "step": 3470 }, { "epoch": 0.73, "grad_norm": 1.2716156244277954, "learning_rate": 7.079133356505619e-06, "loss": 1.239, "step": 3480 }, { "epoch": 0.73, "grad_norm": 1.3892238140106201, "learning_rate": 7.067547213532615e-06, "loss": 1.2453, "step": 3490 }, { "epoch": 0.73, "grad_norm": 1.3814228773117065, "learning_rate": 7.055961070559611e-06, "loss": 1.2483, "step": 3500 }, { "epoch": 0.73, "grad_norm": 1.327527403831482, "learning_rate": 7.044374927586607e-06, "loss": 1.2408, "step": 3510 }, { "epoch": 0.73, "grad_norm": 1.359143614768982, "learning_rate": 7.032788784613602e-06, "loss": 1.2401, "step": 3520 }, { "epoch": 0.74, "grad_norm": 1.2862249612808228, "learning_rate": 7.021202641640599e-06, "loss": 1.2587, "step": 3530 }, { "epoch": 0.74, "grad_norm": 1.3844903707504272, "learning_rate": 7.009616498667594e-06, "loss": 1.2313, "step": 3540 }, { "epoch": 0.74, "grad_norm": 1.318165898323059, "learning_rate": 6.99803035569459e-06, "loss": 1.2517, "step": 3550 }, { "epoch": 0.74, "grad_norm": 1.3430544137954712, "learning_rate": 6.986444212721586e-06, "loss": 1.2351, "step": 3560 }, { "epoch": 0.74, "grad_norm": 1.3766462802886963, "learning_rate": 6.974858069748582e-06, "loss": 1.2374, "step": 3570 }, { "epoch": 0.75, "grad_norm": 1.3596174716949463, "learning_rate": 6.963271926775576e-06, "loss": 1.2463, "step": 3580 }, { "epoch": 0.75, "grad_norm": 1.2986698150634766, "learning_rate": 6.951685783802573e-06, "loss": 1.245, "step": 3590 }, { "epoch": 0.75, "grad_norm": 1.3160896301269531, "learning_rate": 6.940099640829568e-06, "loss": 1.2205, "step": 3600 }, { "epoch": 0.75, "grad_norm": 1.2758549451828003, "learning_rate": 6.928513497856564e-06, "loss": 1.2263, "step": 3610 }, { "epoch": 0.75, "grad_norm": 1.3435688018798828, "learning_rate": 6.91692735488356e-06, "loss": 1.2054, "step": 3620 }, { "epoch": 0.76, "grad_norm": 1.3898662328720093, "learning_rate": 6.905341211910556e-06, "loss": 1.2563, "step": 3630 }, { "epoch": 0.76, "grad_norm": 1.3642399311065674, "learning_rate": 6.893755068937551e-06, "loss": 1.1986, "step": 3640 }, { "epoch": 0.76, "grad_norm": 1.3172852993011475, "learning_rate": 6.8821689259645475e-06, "loss": 1.2369, "step": 3650 }, { "epoch": 0.76, "grad_norm": 1.2481634616851807, "learning_rate": 6.870582782991543e-06, "loss": 1.2256, "step": 3660 }, { "epoch": 0.77, "grad_norm": 1.3152412176132202, "learning_rate": 6.8589966400185386e-06, "loss": 1.2692, "step": 3670 }, { "epoch": 0.77, "grad_norm": 1.3837155103683472, "learning_rate": 6.847410497045535e-06, "loss": 1.2395, "step": 3680 }, { "epoch": 0.77, "grad_norm": 1.1218920946121216, "learning_rate": 6.83582435407253e-06, "loss": 1.2387, "step": 3690 }, { "epoch": 0.77, "grad_norm": 1.3420273065567017, "learning_rate": 6.824238211099525e-06, "loss": 1.257, "step": 3700 }, { "epoch": 0.77, "grad_norm": 1.4104361534118652, "learning_rate": 6.812652068126521e-06, "loss": 1.2445, "step": 3710 }, { "epoch": 0.78, "grad_norm": 1.2503465414047241, "learning_rate": 6.801065925153517e-06, "loss": 1.2041, "step": 3720 }, { "epoch": 0.78, "grad_norm": 1.4227067232131958, "learning_rate": 6.7894797821805126e-06, "loss": 1.244, "step": 3730 }, { "epoch": 0.78, "grad_norm": 1.4678360223770142, "learning_rate": 6.777893639207508e-06, "loss": 1.2477, "step": 3740 }, { "epoch": 0.78, "grad_norm": 1.2533615827560425, "learning_rate": 6.7663074962345045e-06, "loss": 1.2232, "step": 3750 }, { "epoch": 0.78, "grad_norm": 1.3526480197906494, "learning_rate": 6.7547213532615e-06, "loss": 1.2248, "step": 3760 }, { "epoch": 0.79, "grad_norm": 1.2819933891296387, "learning_rate": 6.743135210288495e-06, "loss": 1.2299, "step": 3770 }, { "epoch": 0.79, "grad_norm": 1.2762075662612915, "learning_rate": 6.731549067315492e-06, "loss": 1.2518, "step": 3780 }, { "epoch": 0.79, "grad_norm": 1.3507285118103027, "learning_rate": 6.7199629243424866e-06, "loss": 1.2286, "step": 3790 }, { "epoch": 0.79, "grad_norm": 1.3762954473495483, "learning_rate": 6.708376781369482e-06, "loss": 1.2525, "step": 3800 }, { "epoch": 0.79, "grad_norm": 1.4141082763671875, "learning_rate": 6.6967906383964785e-06, "loss": 1.2346, "step": 3810 }, { "epoch": 0.8, "grad_norm": 1.3597639799118042, "learning_rate": 6.685204495423474e-06, "loss": 1.241, "step": 3820 }, { "epoch": 0.8, "grad_norm": 1.4315990209579468, "learning_rate": 6.6736183524504695e-06, "loss": 1.2288, "step": 3830 }, { "epoch": 0.8, "grad_norm": 1.4011619091033936, "learning_rate": 6.662032209477466e-06, "loss": 1.2436, "step": 3840 }, { "epoch": 0.8, "grad_norm": 1.160510778427124, "learning_rate": 6.650446066504461e-06, "loss": 1.2224, "step": 3850 }, { "epoch": 0.8, "grad_norm": 1.2349698543548584, "learning_rate": 6.638859923531457e-06, "loss": 1.2256, "step": 3860 }, { "epoch": 0.81, "grad_norm": 1.3648158311843872, "learning_rate": 6.627273780558453e-06, "loss": 1.2215, "step": 3870 }, { "epoch": 0.81, "grad_norm": 1.3032734394073486, "learning_rate": 6.615687637585449e-06, "loss": 1.2085, "step": 3880 }, { "epoch": 0.81, "grad_norm": 1.414401650428772, "learning_rate": 6.6041014946124435e-06, "loss": 1.2434, "step": 3890 }, { "epoch": 0.81, "grad_norm": 1.3182495832443237, "learning_rate": 6.59251535163944e-06, "loss": 1.2393, "step": 3900 }, { "epoch": 0.82, "grad_norm": 1.3610124588012695, "learning_rate": 6.580929208666435e-06, "loss": 1.2338, "step": 3910 }, { "epoch": 0.82, "grad_norm": 1.4505993127822876, "learning_rate": 6.569343065693431e-06, "loss": 1.247, "step": 3920 }, { "epoch": 0.82, "grad_norm": 1.2760661840438843, "learning_rate": 6.557756922720427e-06, "loss": 1.2333, "step": 3930 }, { "epoch": 0.82, "grad_norm": 1.301107406616211, "learning_rate": 6.546170779747423e-06, "loss": 1.2121, "step": 3940 }, { "epoch": 0.82, "grad_norm": 1.3670973777770996, "learning_rate": 6.534584636774418e-06, "loss": 1.227, "step": 3950 }, { "epoch": 0.83, "grad_norm": 1.4468539953231812, "learning_rate": 6.522998493801415e-06, "loss": 1.2407, "step": 3960 }, { "epoch": 0.83, "grad_norm": 1.4032971858978271, "learning_rate": 6.51141235082841e-06, "loss": 1.2277, "step": 3970 }, { "epoch": 0.83, "grad_norm": 1.357784390449524, "learning_rate": 6.499826207855405e-06, "loss": 1.2354, "step": 3980 }, { "epoch": 0.83, "grad_norm": 1.3305027484893799, "learning_rate": 6.4882400648824004e-06, "loss": 1.2461, "step": 3990 }, { "epoch": 0.83, "grad_norm": 1.3124698400497437, "learning_rate": 6.476653921909397e-06, "loss": 1.2154, "step": 4000 }, { "epoch": 0.83, "eval_loss": 1.2254222631454468, "eval_runtime": 60.262, "eval_samples_per_second": 102.884, "eval_steps_per_second": 12.861, "step": 4000 }, { "epoch": 0.84, "grad_norm": 1.3544700145721436, "learning_rate": 6.465067778936392e-06, "loss": 1.2126, "step": 4010 }, { "epoch": 0.84, "grad_norm": 1.5063039064407349, "learning_rate": 6.453481635963388e-06, "loss": 1.242, "step": 4020 }, { "epoch": 0.84, "grad_norm": 1.3593188524246216, "learning_rate": 6.441895492990384e-06, "loss": 1.2336, "step": 4030 }, { "epoch": 0.84, "grad_norm": 1.2792909145355225, "learning_rate": 6.43030935001738e-06, "loss": 1.2166, "step": 4040 }, { "epoch": 0.84, "grad_norm": 1.3836535215377808, "learning_rate": 6.418723207044375e-06, "loss": 1.242, "step": 4050 }, { "epoch": 0.85, "grad_norm": 1.3111571073532104, "learning_rate": 6.407137064071372e-06, "loss": 1.2166, "step": 4060 }, { "epoch": 0.85, "grad_norm": 1.3556476831436157, "learning_rate": 6.395550921098367e-06, "loss": 1.2046, "step": 4070 }, { "epoch": 0.85, "grad_norm": 1.4717987775802612, "learning_rate": 6.383964778125362e-06, "loss": 1.2319, "step": 4080 }, { "epoch": 0.85, "grad_norm": 1.3269935846328735, "learning_rate": 6.372378635152359e-06, "loss": 1.2305, "step": 4090 }, { "epoch": 0.86, "grad_norm": 1.2894436120986938, "learning_rate": 6.360792492179354e-06, "loss": 1.2194, "step": 4100 }, { "epoch": 0.86, "grad_norm": 1.3850573301315308, "learning_rate": 6.349206349206349e-06, "loss": 1.2252, "step": 4110 }, { "epoch": 0.86, "grad_norm": 1.3874263763427734, "learning_rate": 6.337620206233346e-06, "loss": 1.21, "step": 4120 }, { "epoch": 0.86, "grad_norm": 1.3668591976165771, "learning_rate": 6.326034063260341e-06, "loss": 1.2083, "step": 4130 }, { "epoch": 0.86, "grad_norm": 1.2573745250701904, "learning_rate": 6.314447920287337e-06, "loss": 1.2335, "step": 4140 }, { "epoch": 0.87, "grad_norm": 1.2464340925216675, "learning_rate": 6.302861777314333e-06, "loss": 1.2337, "step": 4150 }, { "epoch": 0.87, "grad_norm": 1.3820133209228516, "learning_rate": 6.291275634341329e-06, "loss": 1.2125, "step": 4160 }, { "epoch": 0.87, "grad_norm": 1.3374152183532715, "learning_rate": 6.279689491368324e-06, "loss": 1.2191, "step": 4170 }, { "epoch": 0.87, "grad_norm": 1.3884360790252686, "learning_rate": 6.2681033483953205e-06, "loss": 1.2114, "step": 4180 }, { "epoch": 0.87, "grad_norm": 1.2888221740722656, "learning_rate": 6.256517205422315e-06, "loss": 1.2351, "step": 4190 }, { "epoch": 0.88, "grad_norm": 1.272840142250061, "learning_rate": 6.244931062449311e-06, "loss": 1.2001, "step": 4200 }, { "epoch": 0.88, "grad_norm": 1.3938418626785278, "learning_rate": 6.233344919476307e-06, "loss": 1.202, "step": 4210 }, { "epoch": 0.88, "grad_norm": 1.445888876914978, "learning_rate": 6.221758776503303e-06, "loss": 1.2098, "step": 4220 }, { "epoch": 0.88, "grad_norm": 1.320033311843872, "learning_rate": 6.210172633530298e-06, "loss": 1.1894, "step": 4230 }, { "epoch": 0.88, "grad_norm": 1.476035475730896, "learning_rate": 6.198586490557294e-06, "loss": 1.2006, "step": 4240 }, { "epoch": 0.89, "grad_norm": 1.3529783487319946, "learning_rate": 6.18700034758429e-06, "loss": 1.2424, "step": 4250 }, { "epoch": 0.89, "grad_norm": 1.3271926641464233, "learning_rate": 6.1754142046112855e-06, "loss": 1.2142, "step": 4260 }, { "epoch": 0.89, "grad_norm": 1.2901276350021362, "learning_rate": 6.163828061638281e-06, "loss": 1.2199, "step": 4270 }, { "epoch": 0.89, "grad_norm": 1.3919955492019653, "learning_rate": 6.152241918665277e-06, "loss": 1.23, "step": 4280 }, { "epoch": 0.89, "grad_norm": 1.247094750404358, "learning_rate": 6.140655775692272e-06, "loss": 1.2239, "step": 4290 }, { "epoch": 0.9, "grad_norm": 1.2860187292099, "learning_rate": 6.129069632719268e-06, "loss": 1.2363, "step": 4300 }, { "epoch": 0.9, "grad_norm": 1.3451370000839233, "learning_rate": 6.117483489746264e-06, "loss": 1.2448, "step": 4310 }, { "epoch": 0.9, "grad_norm": 1.3651721477508545, "learning_rate": 6.1058973467732595e-06, "loss": 1.2115, "step": 4320 }, { "epoch": 0.9, "grad_norm": 1.351797103881836, "learning_rate": 6.094311203800255e-06, "loss": 1.2122, "step": 4330 }, { "epoch": 0.91, "grad_norm": 1.3105823993682861, "learning_rate": 6.082725060827251e-06, "loss": 1.2185, "step": 4340 }, { "epoch": 0.91, "grad_norm": 1.2687830924987793, "learning_rate": 6.071138917854247e-06, "loss": 1.2127, "step": 4350 }, { "epoch": 0.91, "grad_norm": 1.3442779779434204, "learning_rate": 6.0595527748812425e-06, "loss": 1.2063, "step": 4360 }, { "epoch": 0.91, "grad_norm": 1.4820213317871094, "learning_rate": 6.047966631908239e-06, "loss": 1.2495, "step": 4370 }, { "epoch": 0.91, "grad_norm": 1.4442673921585083, "learning_rate": 6.036380488935234e-06, "loss": 1.2231, "step": 4380 }, { "epoch": 0.92, "grad_norm": 1.328535795211792, "learning_rate": 6.024794345962229e-06, "loss": 1.2015, "step": 4390 }, { "epoch": 0.92, "grad_norm": 1.2676218748092651, "learning_rate": 6.013208202989225e-06, "loss": 1.2337, "step": 4400 }, { "epoch": 0.92, "grad_norm": 1.3665215969085693, "learning_rate": 6.001622060016221e-06, "loss": 1.2256, "step": 4410 }, { "epoch": 0.92, "grad_norm": 1.399819016456604, "learning_rate": 5.9900359170432165e-06, "loss": 1.2025, "step": 4420 }, { "epoch": 0.92, "grad_norm": 1.2961410284042358, "learning_rate": 5.978449774070213e-06, "loss": 1.2154, "step": 4430 }, { "epoch": 0.93, "grad_norm": 1.279374122619629, "learning_rate": 5.966863631097208e-06, "loss": 1.2248, "step": 4440 }, { "epoch": 0.93, "grad_norm": 1.3612409830093384, "learning_rate": 5.955277488124204e-06, "loss": 1.2473, "step": 4450 }, { "epoch": 0.93, "grad_norm": 1.4078785181045532, "learning_rate": 5.9436913451512e-06, "loss": 1.1953, "step": 4460 }, { "epoch": 0.93, "grad_norm": 1.2474583387374878, "learning_rate": 5.932105202178196e-06, "loss": 1.2183, "step": 4470 }, { "epoch": 0.93, "grad_norm": 1.3541713953018188, "learning_rate": 5.920519059205191e-06, "loss": 1.2096, "step": 4480 }, { "epoch": 0.94, "grad_norm": 1.420148253440857, "learning_rate": 5.908932916232186e-06, "loss": 1.2091, "step": 4490 }, { "epoch": 0.94, "grad_norm": 1.2937144041061401, "learning_rate": 5.897346773259182e-06, "loss": 1.2188, "step": 4500 }, { "epoch": 0.94, "grad_norm": 1.4640212059020996, "learning_rate": 5.885760630286178e-06, "loss": 1.2019, "step": 4510 }, { "epoch": 0.94, "grad_norm": 1.3564051389694214, "learning_rate": 5.874174487313173e-06, "loss": 1.2241, "step": 4520 }, { "epoch": 0.94, "grad_norm": 1.3135415315628052, "learning_rate": 5.86258834434017e-06, "loss": 1.227, "step": 4530 }, { "epoch": 0.95, "grad_norm": 1.2767616510391235, "learning_rate": 5.851002201367165e-06, "loss": 1.228, "step": 4540 }, { "epoch": 0.95, "grad_norm": 1.318343162536621, "learning_rate": 5.839416058394161e-06, "loss": 1.2212, "step": 4550 }, { "epoch": 0.95, "grad_norm": 1.4655839204788208, "learning_rate": 5.827829915421157e-06, "loss": 1.2271, "step": 4560 }, { "epoch": 0.95, "grad_norm": 1.3469165563583374, "learning_rate": 5.816243772448153e-06, "loss": 1.2359, "step": 4570 }, { "epoch": 0.96, "grad_norm": 1.2466676235198975, "learning_rate": 5.804657629475147e-06, "loss": 1.2127, "step": 4580 }, { "epoch": 0.96, "grad_norm": 1.3727000951766968, "learning_rate": 5.793071486502145e-06, "loss": 1.2326, "step": 4590 }, { "epoch": 0.96, "grad_norm": 1.25077223777771, "learning_rate": 5.781485343529139e-06, "loss": 1.2383, "step": 4600 }, { "epoch": 0.96, "grad_norm": 1.2925761938095093, "learning_rate": 5.769899200556135e-06, "loss": 1.229, "step": 4610 }, { "epoch": 0.96, "grad_norm": 1.2797605991363525, "learning_rate": 5.758313057583131e-06, "loss": 1.1943, "step": 4620 }, { "epoch": 0.97, "grad_norm": 1.3911211490631104, "learning_rate": 5.746726914610127e-06, "loss": 1.2158, "step": 4630 }, { "epoch": 0.97, "grad_norm": 1.3184133768081665, "learning_rate": 5.735140771637122e-06, "loss": 1.2256, "step": 4640 }, { "epoch": 0.97, "grad_norm": 1.242498755455017, "learning_rate": 5.723554628664119e-06, "loss": 1.2232, "step": 4650 }, { "epoch": 0.97, "grad_norm": 1.2783172130584717, "learning_rate": 5.711968485691114e-06, "loss": 1.2151, "step": 4660 }, { "epoch": 0.97, "grad_norm": 1.1488908529281616, "learning_rate": 5.70038234271811e-06, "loss": 1.2222, "step": 4670 }, { "epoch": 0.98, "grad_norm": 1.2965818643569946, "learning_rate": 5.688796199745106e-06, "loss": 1.2135, "step": 4680 }, { "epoch": 0.98, "grad_norm": 1.4008314609527588, "learning_rate": 5.6772100567721015e-06, "loss": 1.21, "step": 4690 }, { "epoch": 0.98, "grad_norm": 1.3333206176757812, "learning_rate": 5.665623913799096e-06, "loss": 1.235, "step": 4700 }, { "epoch": 0.98, "grad_norm": 1.3453115224838257, "learning_rate": 5.654037770826093e-06, "loss": 1.2175, "step": 4710 }, { "epoch": 0.98, "grad_norm": 1.3863331079483032, "learning_rate": 5.642451627853088e-06, "loss": 1.2171, "step": 4720 }, { "epoch": 0.99, "grad_norm": 1.3186782598495483, "learning_rate": 5.630865484880084e-06, "loss": 1.2179, "step": 4730 }, { "epoch": 0.99, "grad_norm": 1.394921898841858, "learning_rate": 5.61927934190708e-06, "loss": 1.2374, "step": 4740 }, { "epoch": 0.99, "grad_norm": 1.2399722337722778, "learning_rate": 5.6076931989340755e-06, "loss": 1.201, "step": 4750 }, { "epoch": 0.99, "grad_norm": 1.2864711284637451, "learning_rate": 5.596107055961071e-06, "loss": 1.2222, "step": 4760 }, { "epoch": 0.99, "grad_norm": 1.3668363094329834, "learning_rate": 5.584520912988067e-06, "loss": 1.2037, "step": 4770 }, { "epoch": 1.0, "grad_norm": 1.4248427152633667, "learning_rate": 5.572934770015063e-06, "loss": 1.2148, "step": 4780 }, { "epoch": 1.0, "grad_norm": 1.3241183757781982, "learning_rate": 5.561348627042058e-06, "loss": 1.217, "step": 4790 }, { "epoch": 1.0, "grad_norm": 1.2951161861419678, "learning_rate": 5.549762484069053e-06, "loss": 1.2082, "step": 4800 }, { "epoch": 1.0, "grad_norm": 1.4023364782333374, "learning_rate": 5.5381763410960495e-06, "loss": 1.1816, "step": 4810 }, { "epoch": 1.01, "grad_norm": 1.3557953834533691, "learning_rate": 5.526590198123045e-06, "loss": 1.184, "step": 4820 }, { "epoch": 1.01, "grad_norm": 1.282929539680481, "learning_rate": 5.515004055150041e-06, "loss": 1.1849, "step": 4830 }, { "epoch": 1.01, "grad_norm": 1.3420430421829224, "learning_rate": 5.503417912177037e-06, "loss": 1.1783, "step": 4840 }, { "epoch": 1.01, "grad_norm": 1.4730277061462402, "learning_rate": 5.4918317692040325e-06, "loss": 1.1656, "step": 4850 }, { "epoch": 1.01, "grad_norm": 1.3748600482940674, "learning_rate": 5.480245626231028e-06, "loss": 1.2053, "step": 4860 }, { "epoch": 1.02, "grad_norm": 1.395768165588379, "learning_rate": 5.468659483258024e-06, "loss": 1.1576, "step": 4870 }, { "epoch": 1.02, "grad_norm": 1.3293030261993408, "learning_rate": 5.45707334028502e-06, "loss": 1.1857, "step": 4880 }, { "epoch": 1.02, "grad_norm": 1.3349307775497437, "learning_rate": 5.4454871973120146e-06, "loss": 1.1906, "step": 4890 }, { "epoch": 1.02, "grad_norm": 1.4291343688964844, "learning_rate": 5.433901054339012e-06, "loss": 1.1987, "step": 4900 }, { "epoch": 1.02, "grad_norm": 1.542372703552246, "learning_rate": 5.4223149113660065e-06, "loss": 1.1728, "step": 4910 }, { "epoch": 1.03, "grad_norm": 1.3494104146957397, "learning_rate": 5.410728768393002e-06, "loss": 1.1773, "step": 4920 }, { "epoch": 1.03, "grad_norm": 1.3300869464874268, "learning_rate": 5.399142625419998e-06, "loss": 1.1648, "step": 4930 }, { "epoch": 1.03, "grad_norm": 1.3492801189422607, "learning_rate": 5.387556482446994e-06, "loss": 1.1953, "step": 4940 }, { "epoch": 1.03, "grad_norm": 1.3387207984924316, "learning_rate": 5.375970339473989e-06, "loss": 1.1847, "step": 4950 }, { "epoch": 1.03, "grad_norm": 1.323564052581787, "learning_rate": 5.364384196500986e-06, "loss": 1.2085, "step": 4960 }, { "epoch": 1.04, "grad_norm": 1.3828566074371338, "learning_rate": 5.352798053527981e-06, "loss": 1.1617, "step": 4970 }, { "epoch": 1.04, "grad_norm": 1.4634027481079102, "learning_rate": 5.341211910554977e-06, "loss": 1.1663, "step": 4980 }, { "epoch": 1.04, "grad_norm": 1.2878636121749878, "learning_rate": 5.329625767581973e-06, "loss": 1.1809, "step": 4990 }, { "epoch": 1.04, "grad_norm": 1.2984651327133179, "learning_rate": 5.318039624608968e-06, "loss": 1.2281, "step": 5000 }, { "epoch": 1.04, "eval_loss": 1.2093191146850586, "eval_runtime": 60.3546, "eval_samples_per_second": 102.726, "eval_steps_per_second": 12.841, "step": 5000 }, { "epoch": 1.04, "grad_norm": 1.4071316719055176, "learning_rate": 5.306453481635963e-06, "loss": 1.1754, "step": 5010 }, { "epoch": 1.05, "grad_norm": 1.2421307563781738, "learning_rate": 5.294867338662959e-06, "loss": 1.1785, "step": 5020 }, { "epoch": 1.05, "grad_norm": 1.3964232206344604, "learning_rate": 5.283281195689955e-06, "loss": 1.1833, "step": 5030 }, { "epoch": 1.05, "grad_norm": 1.4068059921264648, "learning_rate": 5.271695052716951e-06, "loss": 1.1738, "step": 5040 }, { "epoch": 1.05, "grad_norm": 1.273362398147583, "learning_rate": 5.260108909743946e-06, "loss": 1.1752, "step": 5050 }, { "epoch": 1.06, "grad_norm": 1.3210726976394653, "learning_rate": 5.248522766770943e-06, "loss": 1.1819, "step": 5060 }, { "epoch": 1.06, "grad_norm": 1.339066743850708, "learning_rate": 5.236936623797938e-06, "loss": 1.1803, "step": 5070 }, { "epoch": 1.06, "grad_norm": 1.2689433097839355, "learning_rate": 5.225350480824933e-06, "loss": 1.1899, "step": 5080 }, { "epoch": 1.06, "grad_norm": 1.339124083518982, "learning_rate": 5.21376433785193e-06, "loss": 1.1743, "step": 5090 }, { "epoch": 1.06, "grad_norm": 1.321183681488037, "learning_rate": 5.202178194878925e-06, "loss": 1.1832, "step": 5100 }, { "epoch": 1.07, "grad_norm": 1.3904688358306885, "learning_rate": 5.19059205190592e-06, "loss": 1.1951, "step": 5110 }, { "epoch": 1.07, "grad_norm": 1.421046495437622, "learning_rate": 5.179005908932917e-06, "loss": 1.1841, "step": 5120 }, { "epoch": 1.07, "grad_norm": 1.26221764087677, "learning_rate": 5.167419765959912e-06, "loss": 1.1932, "step": 5130 }, { "epoch": 1.07, "grad_norm": 1.5197116136550903, "learning_rate": 5.155833622986908e-06, "loss": 1.177, "step": 5140 }, { "epoch": 1.07, "grad_norm": 1.3857975006103516, "learning_rate": 5.144247480013904e-06, "loss": 1.1803, "step": 5150 }, { "epoch": 1.08, "grad_norm": 1.3418445587158203, "learning_rate": 5.1326613370409e-06, "loss": 1.1969, "step": 5160 }, { "epoch": 1.08, "grad_norm": 1.4151560068130493, "learning_rate": 5.121075194067895e-06, "loss": 1.179, "step": 5170 }, { "epoch": 1.08, "grad_norm": 1.4019502401351929, "learning_rate": 5.1094890510948916e-06, "loss": 1.1797, "step": 5180 }, { "epoch": 1.08, "grad_norm": 1.3561010360717773, "learning_rate": 5.097902908121887e-06, "loss": 1.1906, "step": 5190 }, { "epoch": 1.08, "grad_norm": 1.344114899635315, "learning_rate": 5.086316765148882e-06, "loss": 1.1851, "step": 5200 }, { "epoch": 1.09, "grad_norm": 1.3627663850784302, "learning_rate": 5.074730622175878e-06, "loss": 1.154, "step": 5210 }, { "epoch": 1.09, "grad_norm": 1.3944242000579834, "learning_rate": 5.063144479202874e-06, "loss": 1.1701, "step": 5220 }, { "epoch": 1.09, "grad_norm": 1.3660093545913696, "learning_rate": 5.051558336229869e-06, "loss": 1.1861, "step": 5230 }, { "epoch": 1.09, "grad_norm": 1.2477737665176392, "learning_rate": 5.0399721932568656e-06, "loss": 1.1917, "step": 5240 }, { "epoch": 1.09, "grad_norm": 1.381079912185669, "learning_rate": 5.028386050283861e-06, "loss": 1.176, "step": 5250 }, { "epoch": 1.1, "grad_norm": 1.277708649635315, "learning_rate": 5.016799907310857e-06, "loss": 1.2262, "step": 5260 }, { "epoch": 1.1, "grad_norm": 1.4118417501449585, "learning_rate": 5.005213764337853e-06, "loss": 1.194, "step": 5270 }, { "epoch": 1.1, "grad_norm": 1.283445119857788, "learning_rate": 4.9936276213648485e-06, "loss": 1.1641, "step": 5280 }, { "epoch": 1.1, "grad_norm": 1.3364849090576172, "learning_rate": 4.982041478391844e-06, "loss": 1.1904, "step": 5290 }, { "epoch": 1.11, "grad_norm": 1.3707315921783447, "learning_rate": 4.9704553354188395e-06, "loss": 1.2228, "step": 5300 }, { "epoch": 1.11, "grad_norm": 1.3826156854629517, "learning_rate": 4.958869192445835e-06, "loss": 1.1808, "step": 5310 }, { "epoch": 1.11, "grad_norm": 1.336873173713684, "learning_rate": 4.947283049472831e-06, "loss": 1.1765, "step": 5320 }, { "epoch": 1.11, "grad_norm": 1.1747794151306152, "learning_rate": 4.935696906499827e-06, "loss": 1.1805, "step": 5330 }, { "epoch": 1.11, "grad_norm": 1.4768750667572021, "learning_rate": 4.924110763526822e-06, "loss": 1.2086, "step": 5340 }, { "epoch": 1.12, "grad_norm": 1.5451874732971191, "learning_rate": 4.912524620553818e-06, "loss": 1.1714, "step": 5350 }, { "epoch": 1.12, "grad_norm": 1.420366644859314, "learning_rate": 4.9009384775808135e-06, "loss": 1.1848, "step": 5360 }, { "epoch": 1.12, "grad_norm": 1.3127976655960083, "learning_rate": 4.889352334607809e-06, "loss": 1.164, "step": 5370 }, { "epoch": 1.12, "grad_norm": 1.3515974283218384, "learning_rate": 4.8777661916348054e-06, "loss": 1.1703, "step": 5380 }, { "epoch": 1.12, "grad_norm": 1.3823376893997192, "learning_rate": 4.866180048661801e-06, "loss": 1.192, "step": 5390 }, { "epoch": 1.13, "grad_norm": 1.300921082496643, "learning_rate": 4.8545939056887965e-06, "loss": 1.1903, "step": 5400 }, { "epoch": 1.13, "grad_norm": 1.3634570837020874, "learning_rate": 4.843007762715792e-06, "loss": 1.1683, "step": 5410 }, { "epoch": 1.13, "grad_norm": 1.4684809446334839, "learning_rate": 4.831421619742788e-06, "loss": 1.1805, "step": 5420 }, { "epoch": 1.13, "grad_norm": 1.3468976020812988, "learning_rate": 4.819835476769784e-06, "loss": 1.1739, "step": 5430 }, { "epoch": 1.13, "grad_norm": 1.4344850778579712, "learning_rate": 4.8082493337967794e-06, "loss": 1.1568, "step": 5440 }, { "epoch": 1.14, "grad_norm": 1.379841685295105, "learning_rate": 4.796663190823776e-06, "loss": 1.2019, "step": 5450 }, { "epoch": 1.14, "grad_norm": 1.5211639404296875, "learning_rate": 4.7850770478507705e-06, "loss": 1.1752, "step": 5460 }, { "epoch": 1.14, "grad_norm": 1.3554086685180664, "learning_rate": 4.773490904877767e-06, "loss": 1.1694, "step": 5470 }, { "epoch": 1.14, "grad_norm": 1.3625938892364502, "learning_rate": 4.761904761904762e-06, "loss": 1.1554, "step": 5480 }, { "epoch": 1.14, "grad_norm": 1.2949503660202026, "learning_rate": 4.750318618931758e-06, "loss": 1.1843, "step": 5490 }, { "epoch": 1.15, "grad_norm": 1.2827516794204712, "learning_rate": 4.738732475958754e-06, "loss": 1.1819, "step": 5500 }, { "epoch": 1.15, "grad_norm": 1.2222222089767456, "learning_rate": 4.727146332985749e-06, "loss": 1.1943, "step": 5510 }, { "epoch": 1.15, "grad_norm": 1.419891357421875, "learning_rate": 4.715560190012745e-06, "loss": 1.1813, "step": 5520 }, { "epoch": 1.15, "grad_norm": 1.2174855470657349, "learning_rate": 4.703974047039741e-06, "loss": 1.1664, "step": 5530 }, { "epoch": 1.16, "grad_norm": 1.2947397232055664, "learning_rate": 4.692387904066736e-06, "loss": 1.1629, "step": 5540 }, { "epoch": 1.16, "grad_norm": 1.3705869913101196, "learning_rate": 4.680801761093732e-06, "loss": 1.1929, "step": 5550 }, { "epoch": 1.16, "grad_norm": 1.3978757858276367, "learning_rate": 4.669215618120728e-06, "loss": 1.1859, "step": 5560 }, { "epoch": 1.16, "grad_norm": 1.4621931314468384, "learning_rate": 4.657629475147724e-06, "loss": 1.1839, "step": 5570 }, { "epoch": 1.16, "grad_norm": 1.29670250415802, "learning_rate": 4.646043332174719e-06, "loss": 1.1709, "step": 5580 }, { "epoch": 1.17, "grad_norm": 1.2986425161361694, "learning_rate": 4.634457189201716e-06, "loss": 1.1616, "step": 5590 }, { "epoch": 1.17, "grad_norm": 1.3808799982070923, "learning_rate": 4.62287104622871e-06, "loss": 1.1606, "step": 5600 }, { "epoch": 1.17, "grad_norm": 1.2885197401046753, "learning_rate": 4.611284903255707e-06, "loss": 1.173, "step": 5610 }, { "epoch": 1.17, "grad_norm": 1.4067176580429077, "learning_rate": 4.599698760282702e-06, "loss": 1.1567, "step": 5620 }, { "epoch": 1.17, "grad_norm": 1.2855522632598877, "learning_rate": 4.588112617309698e-06, "loss": 1.1577, "step": 5630 }, { "epoch": 1.18, "grad_norm": 1.365801453590393, "learning_rate": 4.576526474336694e-06, "loss": 1.1897, "step": 5640 }, { "epoch": 1.18, "grad_norm": 1.3854248523712158, "learning_rate": 4.564940331363689e-06, "loss": 1.1714, "step": 5650 }, { "epoch": 1.18, "grad_norm": 1.288671851158142, "learning_rate": 4.553354188390685e-06, "loss": 1.1691, "step": 5660 }, { "epoch": 1.18, "grad_norm": 1.3368744850158691, "learning_rate": 4.541768045417681e-06, "loss": 1.1676, "step": 5670 }, { "epoch": 1.18, "grad_norm": 1.3967853784561157, "learning_rate": 4.530181902444676e-06, "loss": 1.1816, "step": 5680 }, { "epoch": 1.19, "grad_norm": 1.2882468700408936, "learning_rate": 4.518595759471673e-06, "loss": 1.1732, "step": 5690 }, { "epoch": 1.19, "grad_norm": 1.3411914110183716, "learning_rate": 4.507009616498668e-06, "loss": 1.1943, "step": 5700 }, { "epoch": 1.19, "grad_norm": 1.3860743045806885, "learning_rate": 4.495423473525664e-06, "loss": 1.1771, "step": 5710 }, { "epoch": 1.19, "grad_norm": 1.3196132183074951, "learning_rate": 4.483837330552659e-06, "loss": 1.1661, "step": 5720 }, { "epoch": 1.19, "grad_norm": 1.3677616119384766, "learning_rate": 4.4722511875796556e-06, "loss": 1.1815, "step": 5730 }, { "epoch": 1.2, "grad_norm": 1.4438225030899048, "learning_rate": 4.460665044606651e-06, "loss": 1.193, "step": 5740 }, { "epoch": 1.2, "grad_norm": 1.2998027801513672, "learning_rate": 4.449078901633647e-06, "loss": 1.1665, "step": 5750 }, { "epoch": 1.2, "grad_norm": 1.2200978994369507, "learning_rate": 4.437492758660642e-06, "loss": 1.1706, "step": 5760 }, { "epoch": 1.2, "grad_norm": 1.2530813217163086, "learning_rate": 4.425906615687638e-06, "loss": 1.1836, "step": 5770 }, { "epoch": 1.21, "grad_norm": 1.353306770324707, "learning_rate": 4.414320472714634e-06, "loss": 1.1792, "step": 5780 }, { "epoch": 1.21, "grad_norm": 1.3246160745620728, "learning_rate": 4.4027343297416296e-06, "loss": 1.1785, "step": 5790 }, { "epoch": 1.21, "grad_norm": 1.5756295919418335, "learning_rate": 4.391148186768625e-06, "loss": 1.207, "step": 5800 }, { "epoch": 1.21, "grad_norm": 1.3847354650497437, "learning_rate": 4.379562043795621e-06, "loss": 1.1711, "step": 5810 }, { "epoch": 1.21, "grad_norm": 1.2891080379486084, "learning_rate": 4.367975900822616e-06, "loss": 1.1649, "step": 5820 }, { "epoch": 1.22, "grad_norm": 1.4148257970809937, "learning_rate": 4.3563897578496125e-06, "loss": 1.1836, "step": 5830 }, { "epoch": 1.22, "grad_norm": 1.3125419616699219, "learning_rate": 4.344803614876608e-06, "loss": 1.1831, "step": 5840 }, { "epoch": 1.22, "grad_norm": 1.3058887720108032, "learning_rate": 4.3332174719036036e-06, "loss": 1.1615, "step": 5850 }, { "epoch": 1.22, "grad_norm": 1.2639198303222656, "learning_rate": 4.321631328930599e-06, "loss": 1.1642, "step": 5860 }, { "epoch": 1.22, "grad_norm": 1.4334591627120972, "learning_rate": 4.310045185957595e-06, "loss": 1.1628, "step": 5870 }, { "epoch": 1.23, "grad_norm": 1.3660703897476196, "learning_rate": 4.298459042984591e-06, "loss": 1.1861, "step": 5880 }, { "epoch": 1.23, "grad_norm": 1.3341001272201538, "learning_rate": 4.2868729000115865e-06, "loss": 1.1316, "step": 5890 }, { "epoch": 1.23, "grad_norm": 1.348137378692627, "learning_rate": 4.275286757038582e-06, "loss": 1.147, "step": 5900 }, { "epoch": 1.23, "grad_norm": 1.3110828399658203, "learning_rate": 4.2637006140655775e-06, "loss": 1.1877, "step": 5910 }, { "epoch": 1.23, "grad_norm": 1.3300223350524902, "learning_rate": 4.252114471092574e-06, "loss": 1.1824, "step": 5920 }, { "epoch": 1.24, "grad_norm": 1.299660086631775, "learning_rate": 4.2405283281195694e-06, "loss": 1.1811, "step": 5930 }, { "epoch": 1.24, "grad_norm": 1.3512821197509766, "learning_rate": 4.228942185146565e-06, "loss": 1.1575, "step": 5940 }, { "epoch": 1.24, "grad_norm": 1.4316037893295288, "learning_rate": 4.217356042173561e-06, "loss": 1.1925, "step": 5950 }, { "epoch": 1.24, "grad_norm": 1.266357183456421, "learning_rate": 4.205769899200556e-06, "loss": 1.1641, "step": 5960 }, { "epoch": 1.24, "grad_norm": 1.329677939414978, "learning_rate": 4.194183756227552e-06, "loss": 1.1879, "step": 5970 }, { "epoch": 1.25, "grad_norm": 1.4943017959594727, "learning_rate": 4.182597613254548e-06, "loss": 1.1731, "step": 5980 }, { "epoch": 1.25, "grad_norm": 1.2583523988723755, "learning_rate": 4.1710114702815434e-06, "loss": 1.1659, "step": 5990 }, { "epoch": 1.25, "grad_norm": 1.5405082702636719, "learning_rate": 4.15942532730854e-06, "loss": 1.1796, "step": 6000 }, { "epoch": 1.25, "eval_loss": 1.1978083848953247, "eval_runtime": 60.2518, "eval_samples_per_second": 102.901, "eval_steps_per_second": 12.863, "step": 6000 }, { "epoch": 1.25, "grad_norm": 1.3956575393676758, "learning_rate": 4.1478391843355345e-06, "loss": 1.1614, "step": 6010 }, { "epoch": 1.26, "grad_norm": 1.3546531200408936, "learning_rate": 4.136253041362531e-06, "loss": 1.1583, "step": 6020 }, { "epoch": 1.26, "grad_norm": 1.4522563219070435, "learning_rate": 4.124666898389526e-06, "loss": 1.191, "step": 6030 }, { "epoch": 1.26, "grad_norm": 1.442973017692566, "learning_rate": 4.113080755416522e-06, "loss": 1.1779, "step": 6040 }, { "epoch": 1.26, "grad_norm": 1.2976762056350708, "learning_rate": 4.101494612443518e-06, "loss": 1.1713, "step": 6050 }, { "epoch": 1.26, "grad_norm": 1.338387131690979, "learning_rate": 4.089908469470514e-06, "loss": 1.1736, "step": 6060 }, { "epoch": 1.27, "grad_norm": 1.4153416156768799, "learning_rate": 4.078322326497509e-06, "loss": 1.1773, "step": 6070 }, { "epoch": 1.27, "grad_norm": 1.4394385814666748, "learning_rate": 4.066736183524505e-06, "loss": 1.1923, "step": 6080 }, { "epoch": 1.27, "grad_norm": 1.3546093702316284, "learning_rate": 4.055150040551501e-06, "loss": 1.172, "step": 6090 }, { "epoch": 1.27, "grad_norm": 1.2290021181106567, "learning_rate": 4.043563897578496e-06, "loss": 1.1694, "step": 6100 }, { "epoch": 1.27, "grad_norm": 1.297818899154663, "learning_rate": 4.031977754605492e-06, "loss": 1.1683, "step": 6110 }, { "epoch": 1.28, "grad_norm": 1.3360967636108398, "learning_rate": 4.020391611632488e-06, "loss": 1.1645, "step": 6120 }, { "epoch": 1.28, "grad_norm": 1.4013689756393433, "learning_rate": 4.008805468659483e-06, "loss": 1.1583, "step": 6130 }, { "epoch": 1.28, "grad_norm": 1.3673937320709229, "learning_rate": 3.99721932568648e-06, "loss": 1.1418, "step": 6140 }, { "epoch": 1.28, "grad_norm": 1.4013173580169678, "learning_rate": 3.985633182713474e-06, "loss": 1.1562, "step": 6150 }, { "epoch": 1.28, "grad_norm": 1.23403799533844, "learning_rate": 3.974047039740471e-06, "loss": 1.1731, "step": 6160 }, { "epoch": 1.29, "grad_norm": 1.3346298933029175, "learning_rate": 3.962460896767466e-06, "loss": 1.1548, "step": 6170 }, { "epoch": 1.29, "grad_norm": 1.352409839630127, "learning_rate": 3.950874753794462e-06, "loss": 1.1872, "step": 6180 }, { "epoch": 1.29, "grad_norm": 1.491523265838623, "learning_rate": 3.939288610821458e-06, "loss": 1.1774, "step": 6190 }, { "epoch": 1.29, "grad_norm": 1.393288016319275, "learning_rate": 3.927702467848454e-06, "loss": 1.2024, "step": 6200 }, { "epoch": 1.3, "grad_norm": 1.455191969871521, "learning_rate": 3.916116324875449e-06, "loss": 1.1672, "step": 6210 }, { "epoch": 1.3, "grad_norm": 1.3734558820724487, "learning_rate": 3.904530181902445e-06, "loss": 1.1686, "step": 6220 }, { "epoch": 1.3, "grad_norm": 1.35550057888031, "learning_rate": 3.892944038929441e-06, "loss": 1.1663, "step": 6230 }, { "epoch": 1.3, "grad_norm": 1.2901463508605957, "learning_rate": 3.881357895956437e-06, "loss": 1.1938, "step": 6240 }, { "epoch": 1.3, "grad_norm": 1.272072196006775, "learning_rate": 3.869771752983432e-06, "loss": 1.1487, "step": 6250 }, { "epoch": 1.31, "grad_norm": 1.3195065259933472, "learning_rate": 3.858185610010428e-06, "loss": 1.14, "step": 6260 }, { "epoch": 1.31, "grad_norm": 1.3816322088241577, "learning_rate": 3.846599467037423e-06, "loss": 1.1517, "step": 6270 }, { "epoch": 1.31, "grad_norm": 1.2947728633880615, "learning_rate": 3.83501332406442e-06, "loss": 1.1686, "step": 6280 }, { "epoch": 1.31, "grad_norm": 1.3573113679885864, "learning_rate": 3.823427181091415e-06, "loss": 1.1961, "step": 6290 }, { "epoch": 1.31, "grad_norm": 1.3107391595840454, "learning_rate": 3.8118410381184106e-06, "loss": 1.1891, "step": 6300 }, { "epoch": 1.32, "grad_norm": 1.4028772115707397, "learning_rate": 3.8002548951454066e-06, "loss": 1.1514, "step": 6310 }, { "epoch": 1.32, "grad_norm": 1.3728693723678589, "learning_rate": 3.788668752172402e-06, "loss": 1.1378, "step": 6320 }, { "epoch": 1.32, "grad_norm": 1.3074133396148682, "learning_rate": 3.777082609199398e-06, "loss": 1.1887, "step": 6330 }, { "epoch": 1.32, "grad_norm": 1.4246951341629028, "learning_rate": 3.7654964662263936e-06, "loss": 1.1914, "step": 6340 }, { "epoch": 1.32, "grad_norm": 1.4968029260635376, "learning_rate": 3.753910323253389e-06, "loss": 1.1747, "step": 6350 }, { "epoch": 1.33, "grad_norm": 1.4184929132461548, "learning_rate": 3.742324180280385e-06, "loss": 1.1756, "step": 6360 }, { "epoch": 1.33, "grad_norm": 1.4123560190200806, "learning_rate": 3.730738037307381e-06, "loss": 1.1737, "step": 6370 }, { "epoch": 1.33, "grad_norm": 1.4032166004180908, "learning_rate": 3.719151894334376e-06, "loss": 1.1663, "step": 6380 }, { "epoch": 1.33, "grad_norm": 1.4489792585372925, "learning_rate": 3.707565751361372e-06, "loss": 1.1667, "step": 6390 }, { "epoch": 1.33, "grad_norm": 1.5236833095550537, "learning_rate": 3.6959796083883676e-06, "loss": 1.1548, "step": 6400 }, { "epoch": 1.34, "grad_norm": 1.4273195266723633, "learning_rate": 3.6843934654153635e-06, "loss": 1.1724, "step": 6410 }, { "epoch": 1.34, "grad_norm": 1.3556939363479614, "learning_rate": 3.6728073224423595e-06, "loss": 1.1646, "step": 6420 }, { "epoch": 1.34, "grad_norm": 1.4586809873580933, "learning_rate": 3.6612211794693546e-06, "loss": 1.1648, "step": 6430 }, { "epoch": 1.34, "grad_norm": 1.3400959968566895, "learning_rate": 3.6496350364963505e-06, "loss": 1.149, "step": 6440 }, { "epoch": 1.35, "grad_norm": 1.4101979732513428, "learning_rate": 3.6380488935233465e-06, "loss": 1.1516, "step": 6450 }, { "epoch": 1.35, "grad_norm": 1.3828833103179932, "learning_rate": 3.626462750550342e-06, "loss": 1.1609, "step": 6460 }, { "epoch": 1.35, "grad_norm": 1.4406893253326416, "learning_rate": 3.614876607577338e-06, "loss": 1.1927, "step": 6470 }, { "epoch": 1.35, "grad_norm": 1.3856877088546753, "learning_rate": 3.603290464604334e-06, "loss": 1.1618, "step": 6480 }, { "epoch": 1.35, "grad_norm": 1.5042824745178223, "learning_rate": 3.591704321631329e-06, "loss": 1.169, "step": 6490 }, { "epoch": 1.36, "grad_norm": 1.2347859144210815, "learning_rate": 3.580118178658325e-06, "loss": 1.1693, "step": 6500 }, { "epoch": 1.36, "grad_norm": 1.4108585119247437, "learning_rate": 3.568532035685321e-06, "loss": 1.1807, "step": 6510 }, { "epoch": 1.36, "grad_norm": 1.4458739757537842, "learning_rate": 3.5569458927123164e-06, "loss": 1.1801, "step": 6520 }, { "epoch": 1.36, "grad_norm": 1.3607912063598633, "learning_rate": 3.5453597497393123e-06, "loss": 1.1735, "step": 6530 }, { "epoch": 1.36, "grad_norm": 1.4772734642028809, "learning_rate": 3.5337736067663074e-06, "loss": 1.1798, "step": 6540 }, { "epoch": 1.37, "grad_norm": 1.3552745580673218, "learning_rate": 3.5221874637933034e-06, "loss": 1.1558, "step": 6550 }, { "epoch": 1.37, "grad_norm": 1.3368369340896606, "learning_rate": 3.5106013208202993e-06, "loss": 1.1902, "step": 6560 }, { "epoch": 1.37, "grad_norm": 1.4489260911941528, "learning_rate": 3.499015177847295e-06, "loss": 1.1933, "step": 6570 }, { "epoch": 1.37, "grad_norm": 1.2601078748703003, "learning_rate": 3.487429034874291e-06, "loss": 1.16, "step": 6580 }, { "epoch": 1.37, "grad_norm": 1.3712470531463623, "learning_rate": 3.4758428919012863e-06, "loss": 1.1848, "step": 6590 }, { "epoch": 1.38, "grad_norm": 1.5175118446350098, "learning_rate": 3.464256748928282e-06, "loss": 1.2016, "step": 6600 }, { "epoch": 1.38, "grad_norm": 1.3620766401290894, "learning_rate": 3.452670605955278e-06, "loss": 1.1572, "step": 6610 }, { "epoch": 1.38, "grad_norm": 1.3859511613845825, "learning_rate": 3.4410844629822738e-06, "loss": 1.179, "step": 6620 }, { "epoch": 1.38, "grad_norm": 1.3397611379623413, "learning_rate": 3.4294983200092693e-06, "loss": 1.1589, "step": 6630 }, { "epoch": 1.38, "grad_norm": 1.3851667642593384, "learning_rate": 3.417912177036265e-06, "loss": 1.1679, "step": 6640 }, { "epoch": 1.39, "grad_norm": 1.3727117776870728, "learning_rate": 3.4063260340632603e-06, "loss": 1.1722, "step": 6650 }, { "epoch": 1.39, "grad_norm": 1.3881512880325317, "learning_rate": 3.3947398910902563e-06, "loss": 1.1577, "step": 6660 }, { "epoch": 1.39, "grad_norm": 1.3602831363677979, "learning_rate": 3.3831537481172522e-06, "loss": 1.1661, "step": 6670 }, { "epoch": 1.39, "grad_norm": 1.3330827951431274, "learning_rate": 3.3715676051442473e-06, "loss": 1.1514, "step": 6680 }, { "epoch": 1.4, "grad_norm": 1.4532897472381592, "learning_rate": 3.3599814621712433e-06, "loss": 1.1774, "step": 6690 }, { "epoch": 1.4, "grad_norm": 1.3237956762313843, "learning_rate": 3.3483953191982392e-06, "loss": 1.1711, "step": 6700 }, { "epoch": 1.4, "grad_norm": 1.2269086837768555, "learning_rate": 3.3368091762252348e-06, "loss": 1.1624, "step": 6710 }, { "epoch": 1.4, "grad_norm": 1.2189347743988037, "learning_rate": 3.3252230332522307e-06, "loss": 1.1687, "step": 6720 }, { "epoch": 1.4, "grad_norm": 1.3546026945114136, "learning_rate": 3.3136368902792266e-06, "loss": 1.163, "step": 6730 }, { "epoch": 1.41, "grad_norm": 1.2999235391616821, "learning_rate": 3.3020507473062217e-06, "loss": 1.1456, "step": 6740 }, { "epoch": 1.41, "grad_norm": 1.30852210521698, "learning_rate": 3.2904646043332177e-06, "loss": 1.1892, "step": 6750 }, { "epoch": 1.41, "grad_norm": 1.546865463256836, "learning_rate": 3.2788784613602136e-06, "loss": 1.1683, "step": 6760 }, { "epoch": 1.41, "grad_norm": 1.6763235330581665, "learning_rate": 3.267292318387209e-06, "loss": 1.1826, "step": 6770 }, { "epoch": 1.41, "grad_norm": 1.3352094888687134, "learning_rate": 3.255706175414205e-06, "loss": 1.1588, "step": 6780 }, { "epoch": 1.42, "grad_norm": 1.5526775121688843, "learning_rate": 3.2441200324412002e-06, "loss": 1.16, "step": 6790 }, { "epoch": 1.42, "grad_norm": 1.2830092906951904, "learning_rate": 3.232533889468196e-06, "loss": 1.1536, "step": 6800 }, { "epoch": 1.42, "grad_norm": 1.3866894245147705, "learning_rate": 3.220947746495192e-06, "loss": 1.1466, "step": 6810 }, { "epoch": 1.42, "grad_norm": 1.4593145847320557, "learning_rate": 3.2093616035221876e-06, "loss": 1.2051, "step": 6820 }, { "epoch": 1.42, "grad_norm": 1.231041669845581, "learning_rate": 3.1977754605491836e-06, "loss": 1.1576, "step": 6830 }, { "epoch": 1.43, "grad_norm": 1.3121471405029297, "learning_rate": 3.1861893175761795e-06, "loss": 1.1806, "step": 6840 }, { "epoch": 1.43, "grad_norm": 1.513574242591858, "learning_rate": 3.1746031746031746e-06, "loss": 1.1721, "step": 6850 }, { "epoch": 1.43, "grad_norm": 1.3530925512313843, "learning_rate": 3.1630170316301706e-06, "loss": 1.1623, "step": 6860 }, { "epoch": 1.43, "grad_norm": 1.3993455171585083, "learning_rate": 3.1514308886571665e-06, "loss": 1.1674, "step": 6870 }, { "epoch": 1.43, "grad_norm": 1.3361057043075562, "learning_rate": 3.139844745684162e-06, "loss": 1.1677, "step": 6880 }, { "epoch": 1.44, "grad_norm": 1.4186984300613403, "learning_rate": 3.1282586027111576e-06, "loss": 1.1863, "step": 6890 }, { "epoch": 1.44, "grad_norm": 1.5800833702087402, "learning_rate": 3.1166724597381535e-06, "loss": 1.1733, "step": 6900 }, { "epoch": 1.44, "grad_norm": 1.3589065074920654, "learning_rate": 3.105086316765149e-06, "loss": 1.1497, "step": 6910 }, { "epoch": 1.44, "grad_norm": 1.3705077171325684, "learning_rate": 3.093500173792145e-06, "loss": 1.1578, "step": 6920 }, { "epoch": 1.45, "grad_norm": 1.3242499828338623, "learning_rate": 3.0819140308191405e-06, "loss": 1.1678, "step": 6930 }, { "epoch": 1.45, "grad_norm": 1.4003245830535889, "learning_rate": 3.070327887846136e-06, "loss": 1.1653, "step": 6940 }, { "epoch": 1.45, "grad_norm": 1.3863157033920288, "learning_rate": 3.058741744873132e-06, "loss": 1.1787, "step": 6950 }, { "epoch": 1.45, "grad_norm": 1.401785969734192, "learning_rate": 3.0471556019001275e-06, "loss": 1.1697, "step": 6960 }, { "epoch": 1.45, "grad_norm": 1.291735053062439, "learning_rate": 3.0355694589271235e-06, "loss": 1.1729, "step": 6970 }, { "epoch": 1.46, "grad_norm": 1.374833583831787, "learning_rate": 3.0239833159541194e-06, "loss": 1.1623, "step": 6980 }, { "epoch": 1.46, "grad_norm": 1.388802409172058, "learning_rate": 3.0123971729811145e-06, "loss": 1.1655, "step": 6990 }, { "epoch": 1.46, "grad_norm": 1.3213307857513428, "learning_rate": 3.0008110300081105e-06, "loss": 1.1698, "step": 7000 }, { "epoch": 1.46, "eval_loss": 1.188651204109192, "eval_runtime": 60.2084, "eval_samples_per_second": 102.976, "eval_steps_per_second": 12.872, "step": 7000 }, { "epoch": 1.46, "grad_norm": 1.284946322441101, "learning_rate": 2.9892248870351064e-06, "loss": 1.1619, "step": 7010 }, { "epoch": 1.46, "grad_norm": 1.4130306243896484, "learning_rate": 2.977638744062102e-06, "loss": 1.1714, "step": 7020 }, { "epoch": 1.47, "grad_norm": 1.4033979177474976, "learning_rate": 2.966052601089098e-06, "loss": 1.1564, "step": 7030 }, { "epoch": 1.47, "grad_norm": 1.303514003753662, "learning_rate": 2.954466458116093e-06, "loss": 1.1506, "step": 7040 }, { "epoch": 1.47, "grad_norm": 1.3504719734191895, "learning_rate": 2.942880315143089e-06, "loss": 1.1647, "step": 7050 }, { "epoch": 1.47, "grad_norm": 1.319595456123352, "learning_rate": 2.931294172170085e-06, "loss": 1.1663, "step": 7060 }, { "epoch": 1.47, "grad_norm": 1.6136802434921265, "learning_rate": 2.9197080291970804e-06, "loss": 1.1508, "step": 7070 }, { "epoch": 1.48, "grad_norm": 1.470433235168457, "learning_rate": 2.9081218862240764e-06, "loss": 1.1777, "step": 7080 }, { "epoch": 1.48, "grad_norm": 1.3718762397766113, "learning_rate": 2.8965357432510723e-06, "loss": 1.1589, "step": 7090 }, { "epoch": 1.48, "grad_norm": 1.4196834564208984, "learning_rate": 2.8849496002780674e-06, "loss": 1.1799, "step": 7100 }, { "epoch": 1.48, "grad_norm": 1.301185131072998, "learning_rate": 2.8733634573050634e-06, "loss": 1.1518, "step": 7110 }, { "epoch": 1.48, "grad_norm": 1.3427518606185913, "learning_rate": 2.8617773143320593e-06, "loss": 1.1566, "step": 7120 }, { "epoch": 1.49, "grad_norm": 1.4186878204345703, "learning_rate": 2.850191171359055e-06, "loss": 1.1825, "step": 7130 }, { "epoch": 1.49, "grad_norm": 1.3203967809677124, "learning_rate": 2.8386050283860508e-06, "loss": 1.1566, "step": 7140 }, { "epoch": 1.49, "grad_norm": 1.490299940109253, "learning_rate": 2.8270188854130463e-06, "loss": 1.1774, "step": 7150 }, { "epoch": 1.49, "grad_norm": 1.4719760417938232, "learning_rate": 2.815432742440042e-06, "loss": 1.1621, "step": 7160 }, { "epoch": 1.5, "grad_norm": 1.284829020500183, "learning_rate": 2.8038465994670378e-06, "loss": 1.1575, "step": 7170 }, { "epoch": 1.5, "grad_norm": 1.3767919540405273, "learning_rate": 2.7922604564940333e-06, "loss": 1.1623, "step": 7180 }, { "epoch": 1.5, "grad_norm": 1.3010685443878174, "learning_rate": 2.780674313521029e-06, "loss": 1.1686, "step": 7190 }, { "epoch": 1.5, "grad_norm": 1.3428490161895752, "learning_rate": 2.7690881705480248e-06, "loss": 1.1589, "step": 7200 }, { "epoch": 1.5, "grad_norm": 1.3504585027694702, "learning_rate": 2.7575020275750203e-06, "loss": 1.1815, "step": 7210 }, { "epoch": 1.51, "grad_norm": 1.4558221101760864, "learning_rate": 2.7459158846020162e-06, "loss": 1.1641, "step": 7220 }, { "epoch": 1.51, "grad_norm": 1.3460663557052612, "learning_rate": 2.734329741629012e-06, "loss": 1.137, "step": 7230 }, { "epoch": 1.51, "grad_norm": 1.5156904458999634, "learning_rate": 2.7227435986560073e-06, "loss": 1.1416, "step": 7240 }, { "epoch": 1.51, "grad_norm": 1.4338836669921875, "learning_rate": 2.7111574556830032e-06, "loss": 1.1661, "step": 7250 }, { "epoch": 1.51, "grad_norm": 1.2937146425247192, "learning_rate": 2.699571312709999e-06, "loss": 1.1619, "step": 7260 }, { "epoch": 1.52, "grad_norm": 1.3950343132019043, "learning_rate": 2.6879851697369947e-06, "loss": 1.1502, "step": 7270 }, { "epoch": 1.52, "grad_norm": 1.2576100826263428, "learning_rate": 2.6763990267639907e-06, "loss": 1.1817, "step": 7280 }, { "epoch": 1.52, "grad_norm": 1.3002537488937378, "learning_rate": 2.6648128837909866e-06, "loss": 1.148, "step": 7290 }, { "epoch": 1.52, "grad_norm": 1.474173903465271, "learning_rate": 2.6532267408179817e-06, "loss": 1.16, "step": 7300 }, { "epoch": 1.52, "grad_norm": 1.361527681350708, "learning_rate": 2.6416405978449777e-06, "loss": 1.1629, "step": 7310 }, { "epoch": 1.53, "grad_norm": 1.3156787157058716, "learning_rate": 2.630054454871973e-06, "loss": 1.1701, "step": 7320 }, { "epoch": 1.53, "grad_norm": 1.49290132522583, "learning_rate": 2.618468311898969e-06, "loss": 1.164, "step": 7330 }, { "epoch": 1.53, "grad_norm": 1.3317513465881348, "learning_rate": 2.606882168925965e-06, "loss": 1.1531, "step": 7340 }, { "epoch": 1.53, "grad_norm": 1.3225781917572021, "learning_rate": 2.59529602595296e-06, "loss": 1.1513, "step": 7350 }, { "epoch": 1.53, "grad_norm": 1.4606074094772339, "learning_rate": 2.583709882979956e-06, "loss": 1.1596, "step": 7360 }, { "epoch": 1.54, "grad_norm": 1.5031137466430664, "learning_rate": 2.572123740006952e-06, "loss": 1.1519, "step": 7370 }, { "epoch": 1.54, "grad_norm": 1.413018822669983, "learning_rate": 2.5605375970339476e-06, "loss": 1.1715, "step": 7380 }, { "epoch": 1.54, "grad_norm": 1.440049409866333, "learning_rate": 2.5489514540609435e-06, "loss": 1.1665, "step": 7390 }, { "epoch": 1.54, "grad_norm": 1.4148393869400024, "learning_rate": 2.537365311087939e-06, "loss": 1.1515, "step": 7400 }, { "epoch": 1.55, "grad_norm": 1.4196386337280273, "learning_rate": 2.5257791681149346e-06, "loss": 1.1542, "step": 7410 }, { "epoch": 1.55, "grad_norm": 1.4080944061279297, "learning_rate": 2.5141930251419305e-06, "loss": 1.1662, "step": 7420 }, { "epoch": 1.55, "grad_norm": 1.366580843925476, "learning_rate": 2.5026068821689265e-06, "loss": 1.1656, "step": 7430 }, { "epoch": 1.55, "grad_norm": 1.329052209854126, "learning_rate": 2.491020739195922e-06, "loss": 1.1562, "step": 7440 }, { "epoch": 1.55, "grad_norm": 1.4457416534423828, "learning_rate": 2.4794345962229175e-06, "loss": 1.1757, "step": 7450 }, { "epoch": 1.56, "grad_norm": 1.3773384094238281, "learning_rate": 2.4678484532499135e-06, "loss": 1.1724, "step": 7460 }, { "epoch": 1.56, "grad_norm": 1.4378938674926758, "learning_rate": 2.456262310276909e-06, "loss": 1.1599, "step": 7470 }, { "epoch": 1.56, "grad_norm": 1.4069546461105347, "learning_rate": 2.4446761673039045e-06, "loss": 1.1784, "step": 7480 }, { "epoch": 1.56, "grad_norm": 1.4680144786834717, "learning_rate": 2.4330900243309005e-06, "loss": 1.1548, "step": 7490 }, { "epoch": 1.56, "grad_norm": 1.4188005924224854, "learning_rate": 2.421503881357896e-06, "loss": 1.1599, "step": 7500 }, { "epoch": 1.57, "grad_norm": 1.3987394571304321, "learning_rate": 2.409917738384892e-06, "loss": 1.1674, "step": 7510 }, { "epoch": 1.57, "grad_norm": 1.3398780822753906, "learning_rate": 2.398331595411888e-06, "loss": 1.1457, "step": 7520 }, { "epoch": 1.57, "grad_norm": 1.3323601484298706, "learning_rate": 2.3867454524388834e-06, "loss": 1.1583, "step": 7530 }, { "epoch": 1.57, "grad_norm": 1.4326480627059937, "learning_rate": 2.375159309465879e-06, "loss": 1.1746, "step": 7540 }, { "epoch": 1.57, "grad_norm": 1.4204566478729248, "learning_rate": 2.3635731664928745e-06, "loss": 1.1401, "step": 7550 }, { "epoch": 1.58, "grad_norm": 1.554408073425293, "learning_rate": 2.3519870235198704e-06, "loss": 1.1531, "step": 7560 }, { "epoch": 1.58, "grad_norm": 1.4092999696731567, "learning_rate": 2.340400880546866e-06, "loss": 1.1601, "step": 7570 }, { "epoch": 1.58, "grad_norm": 1.502114176750183, "learning_rate": 2.328814737573862e-06, "loss": 1.1625, "step": 7580 }, { "epoch": 1.58, "grad_norm": 1.319533348083496, "learning_rate": 2.317228594600858e-06, "loss": 1.1411, "step": 7590 }, { "epoch": 1.58, "grad_norm": 1.3012977838516235, "learning_rate": 2.3056424516278534e-06, "loss": 1.1716, "step": 7600 }, { "epoch": 1.59, "grad_norm": 1.3816500902175903, "learning_rate": 2.294056308654849e-06, "loss": 1.1468, "step": 7610 }, { "epoch": 1.59, "grad_norm": 1.4806541204452515, "learning_rate": 2.2824701656818444e-06, "loss": 1.139, "step": 7620 }, { "epoch": 1.59, "grad_norm": 1.4676132202148438, "learning_rate": 2.2708840227088404e-06, "loss": 1.1717, "step": 7630 }, { "epoch": 1.59, "grad_norm": 1.4198739528656006, "learning_rate": 2.2592978797358363e-06, "loss": 1.1671, "step": 7640 }, { "epoch": 1.6, "grad_norm": 1.5252964496612549, "learning_rate": 2.247711736762832e-06, "loss": 1.149, "step": 7650 }, { "epoch": 1.6, "grad_norm": 1.4139654636383057, "learning_rate": 2.2361255937898278e-06, "loss": 1.1606, "step": 7660 }, { "epoch": 1.6, "grad_norm": 1.3876101970672607, "learning_rate": 2.2245394508168233e-06, "loss": 1.1421, "step": 7670 }, { "epoch": 1.6, "grad_norm": 1.2850245237350464, "learning_rate": 2.212953307843819e-06, "loss": 1.1725, "step": 7680 }, { "epoch": 1.6, "grad_norm": 1.2772204875946045, "learning_rate": 2.2013671648708148e-06, "loss": 1.1612, "step": 7690 }, { "epoch": 1.61, "grad_norm": 1.4063725471496582, "learning_rate": 2.1897810218978103e-06, "loss": 1.18, "step": 7700 }, { "epoch": 1.61, "grad_norm": 1.239322543144226, "learning_rate": 2.1781948789248063e-06, "loss": 1.1477, "step": 7710 }, { "epoch": 1.61, "grad_norm": 1.381710171699524, "learning_rate": 2.1666087359518018e-06, "loss": 1.1786, "step": 7720 }, { "epoch": 1.61, "grad_norm": 1.2689425945281982, "learning_rate": 2.1550225929787973e-06, "loss": 1.1655, "step": 7730 }, { "epoch": 1.61, "grad_norm": 1.4614635705947876, "learning_rate": 2.1434364500057933e-06, "loss": 1.1742, "step": 7740 }, { "epoch": 1.62, "grad_norm": 1.383084774017334, "learning_rate": 2.1318503070327888e-06, "loss": 1.1628, "step": 7750 }, { "epoch": 1.62, "grad_norm": 1.4612348079681396, "learning_rate": 2.1202641640597847e-06, "loss": 1.1829, "step": 7760 }, { "epoch": 1.62, "grad_norm": 1.4572889804840088, "learning_rate": 2.1086780210867807e-06, "loss": 1.1262, "step": 7770 }, { "epoch": 1.62, "grad_norm": 1.3573559522628784, "learning_rate": 2.097091878113776e-06, "loss": 1.1657, "step": 7780 }, { "epoch": 1.62, "grad_norm": 1.3430317640304565, "learning_rate": 2.0855057351407717e-06, "loss": 1.1563, "step": 7790 }, { "epoch": 1.63, "grad_norm": 1.3933885097503662, "learning_rate": 2.0739195921677672e-06, "loss": 1.1655, "step": 7800 }, { "epoch": 1.63, "grad_norm": 1.3937543630599976, "learning_rate": 2.062333449194763e-06, "loss": 1.1554, "step": 7810 }, { "epoch": 1.63, "grad_norm": 1.422379493713379, "learning_rate": 2.050747306221759e-06, "loss": 1.1785, "step": 7820 }, { "epoch": 1.63, "grad_norm": 1.4572478532791138, "learning_rate": 2.0391611632487547e-06, "loss": 1.1618, "step": 7830 }, { "epoch": 1.63, "grad_norm": 1.5157662630081177, "learning_rate": 2.0275750202757506e-06, "loss": 1.1577, "step": 7840 }, { "epoch": 1.64, "grad_norm": 1.4413377046585083, "learning_rate": 2.015988877302746e-06, "loss": 1.171, "step": 7850 }, { "epoch": 1.64, "grad_norm": 1.472377061843872, "learning_rate": 2.0044027343297417e-06, "loss": 1.1562, "step": 7860 }, { "epoch": 1.64, "grad_norm": 1.3218101263046265, "learning_rate": 1.992816591356737e-06, "loss": 1.1528, "step": 7870 }, { "epoch": 1.64, "grad_norm": 1.460989236831665, "learning_rate": 1.981230448383733e-06, "loss": 1.1568, "step": 7880 }, { "epoch": 1.65, "grad_norm": 1.3286033868789673, "learning_rate": 1.969644305410729e-06, "loss": 1.1467, "step": 7890 }, { "epoch": 1.65, "grad_norm": 1.386688470840454, "learning_rate": 1.9580581624377246e-06, "loss": 1.162, "step": 7900 }, { "epoch": 1.65, "grad_norm": 1.3312408924102783, "learning_rate": 1.9464720194647206e-06, "loss": 1.1483, "step": 7910 }, { "epoch": 1.65, "grad_norm": 1.3570996522903442, "learning_rate": 1.934885876491716e-06, "loss": 1.1552, "step": 7920 }, { "epoch": 1.65, "grad_norm": 1.502630352973938, "learning_rate": 1.9232997335187116e-06, "loss": 1.1627, "step": 7930 }, { "epoch": 1.66, "grad_norm": 1.2717859745025635, "learning_rate": 1.9117135905457076e-06, "loss": 1.1518, "step": 7940 }, { "epoch": 1.66, "grad_norm": 1.4142167568206787, "learning_rate": 1.9001274475727033e-06, "loss": 1.1686, "step": 7950 }, { "epoch": 1.66, "grad_norm": 1.3600444793701172, "learning_rate": 1.888541304599699e-06, "loss": 1.1585, "step": 7960 }, { "epoch": 1.66, "grad_norm": 1.3272130489349365, "learning_rate": 1.8769551616266945e-06, "loss": 1.1369, "step": 7970 }, { "epoch": 1.66, "grad_norm": 1.4064016342163086, "learning_rate": 1.8653690186536905e-06, "loss": 1.1551, "step": 7980 }, { "epoch": 1.67, "grad_norm": 1.4075134992599487, "learning_rate": 1.853782875680686e-06, "loss": 1.1771, "step": 7990 }, { "epoch": 1.67, "grad_norm": 1.4398192167282104, "learning_rate": 1.8421967327076818e-06, "loss": 1.1643, "step": 8000 }, { "epoch": 1.67, "eval_loss": 1.1820555925369263, "eval_runtime": 60.2709, "eval_samples_per_second": 102.869, "eval_steps_per_second": 12.859, "step": 8000 }, { "epoch": 1.67, "grad_norm": 1.490997076034546, "learning_rate": 1.8306105897346773e-06, "loss": 1.1809, "step": 8010 }, { "epoch": 1.67, "grad_norm": 1.3520011901855469, "learning_rate": 1.8190244467616732e-06, "loss": 1.1466, "step": 8020 }, { "epoch": 1.67, "grad_norm": 1.3630290031433105, "learning_rate": 1.807438303788669e-06, "loss": 1.1692, "step": 8030 }, { "epoch": 1.68, "grad_norm": 1.489708423614502, "learning_rate": 1.7958521608156645e-06, "loss": 1.1581, "step": 8040 }, { "epoch": 1.68, "grad_norm": 1.386827826499939, "learning_rate": 1.7842660178426604e-06, "loss": 1.1536, "step": 8050 }, { "epoch": 1.68, "grad_norm": 1.3416543006896973, "learning_rate": 1.7726798748696562e-06, "loss": 1.1627, "step": 8060 }, { "epoch": 1.68, "grad_norm": 1.4776194095611572, "learning_rate": 1.7610937318966517e-06, "loss": 1.1496, "step": 8070 }, { "epoch": 1.69, "grad_norm": 1.3277490139007568, "learning_rate": 1.7495075889236474e-06, "loss": 1.1672, "step": 8080 }, { "epoch": 1.69, "grad_norm": 1.3496487140655518, "learning_rate": 1.7379214459506432e-06, "loss": 1.1578, "step": 8090 }, { "epoch": 1.69, "grad_norm": 1.4114041328430176, "learning_rate": 1.726335302977639e-06, "loss": 1.1716, "step": 8100 }, { "epoch": 1.69, "grad_norm": 1.4332119226455688, "learning_rate": 1.7147491600046346e-06, "loss": 1.1527, "step": 8110 }, { "epoch": 1.69, "grad_norm": 1.508005976676941, "learning_rate": 1.7031630170316302e-06, "loss": 1.1629, "step": 8120 }, { "epoch": 1.7, "grad_norm": 1.3349090814590454, "learning_rate": 1.6915768740586261e-06, "loss": 1.1536, "step": 8130 }, { "epoch": 1.7, "grad_norm": 1.4674837589263916, "learning_rate": 1.6799907310856216e-06, "loss": 1.1704, "step": 8140 }, { "epoch": 1.7, "grad_norm": 1.3746360540390015, "learning_rate": 1.6684045881126174e-06, "loss": 1.1495, "step": 8150 }, { "epoch": 1.7, "grad_norm": 1.370354175567627, "learning_rate": 1.6568184451396133e-06, "loss": 1.1597, "step": 8160 }, { "epoch": 1.7, "grad_norm": 1.3676754236221313, "learning_rate": 1.6452323021666088e-06, "loss": 1.1516, "step": 8170 }, { "epoch": 1.71, "grad_norm": 1.399137020111084, "learning_rate": 1.6336461591936046e-06, "loss": 1.175, "step": 8180 }, { "epoch": 1.71, "grad_norm": 1.4263873100280762, "learning_rate": 1.6220600162206001e-06, "loss": 1.168, "step": 8190 }, { "epoch": 1.71, "grad_norm": 1.2758104801177979, "learning_rate": 1.610473873247596e-06, "loss": 1.1683, "step": 8200 }, { "epoch": 1.71, "grad_norm": 1.4730124473571777, "learning_rate": 1.5988877302745918e-06, "loss": 1.179, "step": 8210 }, { "epoch": 1.71, "grad_norm": 1.3805128335952759, "learning_rate": 1.5873015873015873e-06, "loss": 1.1105, "step": 8220 }, { "epoch": 1.72, "grad_norm": 1.3347926139831543, "learning_rate": 1.5757154443285833e-06, "loss": 1.1447, "step": 8230 }, { "epoch": 1.72, "grad_norm": 1.3892558813095093, "learning_rate": 1.5641293013555788e-06, "loss": 1.1397, "step": 8240 }, { "epoch": 1.72, "grad_norm": 1.410576581954956, "learning_rate": 1.5525431583825745e-06, "loss": 1.159, "step": 8250 }, { "epoch": 1.72, "grad_norm": 1.4224263429641724, "learning_rate": 1.5409570154095703e-06, "loss": 1.1605, "step": 8260 }, { "epoch": 1.72, "grad_norm": 1.3661879301071167, "learning_rate": 1.529370872436566e-06, "loss": 1.1622, "step": 8270 }, { "epoch": 1.73, "grad_norm": 1.5230523347854614, "learning_rate": 1.5177847294635617e-06, "loss": 1.1594, "step": 8280 }, { "epoch": 1.73, "grad_norm": 1.5382969379425049, "learning_rate": 1.5061985864905573e-06, "loss": 1.1793, "step": 8290 }, { "epoch": 1.73, "grad_norm": 1.2719836235046387, "learning_rate": 1.4946124435175532e-06, "loss": 1.1346, "step": 8300 }, { "epoch": 1.73, "grad_norm": 1.3252252340316772, "learning_rate": 1.483026300544549e-06, "loss": 1.1519, "step": 8310 }, { "epoch": 1.74, "grad_norm": 1.2065455913543701, "learning_rate": 1.4714401575715445e-06, "loss": 1.1581, "step": 8320 }, { "epoch": 1.74, "grad_norm": 1.382408857345581, "learning_rate": 1.4598540145985402e-06, "loss": 1.1335, "step": 8330 }, { "epoch": 1.74, "grad_norm": 1.4275171756744385, "learning_rate": 1.4482678716255362e-06, "loss": 1.1686, "step": 8340 }, { "epoch": 1.74, "grad_norm": 1.332566261291504, "learning_rate": 1.4366817286525317e-06, "loss": 1.1786, "step": 8350 }, { "epoch": 1.74, "grad_norm": 1.3520534038543701, "learning_rate": 1.4250955856795274e-06, "loss": 1.1677, "step": 8360 }, { "epoch": 1.75, "grad_norm": 1.4580789804458618, "learning_rate": 1.4135094427065231e-06, "loss": 1.1404, "step": 8370 }, { "epoch": 1.75, "grad_norm": 1.307956576347351, "learning_rate": 1.4019232997335189e-06, "loss": 1.1605, "step": 8380 }, { "epoch": 1.75, "grad_norm": 1.3615368604660034, "learning_rate": 1.3903371567605144e-06, "loss": 1.1574, "step": 8390 }, { "epoch": 1.75, "grad_norm": 1.4463071823120117, "learning_rate": 1.3787510137875101e-06, "loss": 1.1534, "step": 8400 }, { "epoch": 1.75, "grad_norm": 1.432977557182312, "learning_rate": 1.367164870814506e-06, "loss": 1.1603, "step": 8410 }, { "epoch": 1.76, "grad_norm": 1.3181747198104858, "learning_rate": 1.3555787278415016e-06, "loss": 1.1707, "step": 8420 }, { "epoch": 1.76, "grad_norm": 1.481392502784729, "learning_rate": 1.3439925848684974e-06, "loss": 1.1743, "step": 8430 }, { "epoch": 1.76, "grad_norm": 1.4732555150985718, "learning_rate": 1.3324064418954933e-06, "loss": 1.1741, "step": 8440 }, { "epoch": 1.76, "grad_norm": 1.4790183305740356, "learning_rate": 1.3208202989224888e-06, "loss": 1.1821, "step": 8450 }, { "epoch": 1.76, "grad_norm": 1.2341439723968506, "learning_rate": 1.3092341559494846e-06, "loss": 1.1661, "step": 8460 }, { "epoch": 1.77, "grad_norm": 1.4015235900878906, "learning_rate": 1.29764801297648e-06, "loss": 1.1704, "step": 8470 }, { "epoch": 1.77, "grad_norm": 1.2874525785446167, "learning_rate": 1.286061870003476e-06, "loss": 1.1584, "step": 8480 }, { "epoch": 1.77, "grad_norm": 1.497933030128479, "learning_rate": 1.2744757270304718e-06, "loss": 1.1648, "step": 8490 }, { "epoch": 1.77, "grad_norm": 1.3120431900024414, "learning_rate": 1.2628895840574673e-06, "loss": 1.187, "step": 8500 }, { "epoch": 1.77, "grad_norm": 1.327337622642517, "learning_rate": 1.2513034410844632e-06, "loss": 1.1738, "step": 8510 }, { "epoch": 1.78, "grad_norm": 1.224055528640747, "learning_rate": 1.2397172981114588e-06, "loss": 1.1518, "step": 8520 }, { "epoch": 1.78, "grad_norm": 1.5562435388565063, "learning_rate": 1.2281311551384545e-06, "loss": 1.1634, "step": 8530 }, { "epoch": 1.78, "grad_norm": 1.4716066122055054, "learning_rate": 1.2165450121654502e-06, "loss": 1.162, "step": 8540 }, { "epoch": 1.78, "grad_norm": 1.395019769668579, "learning_rate": 1.204958869192446e-06, "loss": 1.1696, "step": 8550 }, { "epoch": 1.79, "grad_norm": 1.557733416557312, "learning_rate": 1.1933727262194417e-06, "loss": 1.1699, "step": 8560 }, { "epoch": 1.79, "grad_norm": 1.4722071886062622, "learning_rate": 1.1817865832464372e-06, "loss": 1.1678, "step": 8570 }, { "epoch": 1.79, "grad_norm": 1.4380254745483398, "learning_rate": 1.170200440273433e-06, "loss": 1.1694, "step": 8580 }, { "epoch": 1.79, "grad_norm": 1.3241981267929077, "learning_rate": 1.158614297300429e-06, "loss": 1.1539, "step": 8590 }, { "epoch": 1.79, "grad_norm": 1.4241721630096436, "learning_rate": 1.1470281543274244e-06, "loss": 1.1493, "step": 8600 }, { "epoch": 1.8, "grad_norm": 1.2441765069961548, "learning_rate": 1.1354420113544202e-06, "loss": 1.143, "step": 8610 }, { "epoch": 1.8, "grad_norm": 1.3431496620178223, "learning_rate": 1.123855868381416e-06, "loss": 1.1343, "step": 8620 }, { "epoch": 1.8, "grad_norm": 1.4855791330337524, "learning_rate": 1.1122697254084117e-06, "loss": 1.1424, "step": 8630 }, { "epoch": 1.8, "grad_norm": 1.3640236854553223, "learning_rate": 1.1006835824354074e-06, "loss": 1.1751, "step": 8640 }, { "epoch": 1.8, "grad_norm": 1.414133906364441, "learning_rate": 1.0890974394624031e-06, "loss": 1.1595, "step": 8650 }, { "epoch": 1.81, "grad_norm": 1.3939659595489502, "learning_rate": 1.0775112964893987e-06, "loss": 1.1735, "step": 8660 }, { "epoch": 1.81, "grad_norm": 1.5434401035308838, "learning_rate": 1.0659251535163944e-06, "loss": 1.1629, "step": 8670 }, { "epoch": 1.81, "grad_norm": 1.3433867692947388, "learning_rate": 1.0543390105433903e-06, "loss": 1.1606, "step": 8680 }, { "epoch": 1.81, "grad_norm": 1.4562678337097168, "learning_rate": 1.0427528675703859e-06, "loss": 1.168, "step": 8690 }, { "epoch": 1.81, "grad_norm": 1.3237522840499878, "learning_rate": 1.0311667245973816e-06, "loss": 1.1599, "step": 8700 }, { "epoch": 1.82, "grad_norm": 1.326677680015564, "learning_rate": 1.0195805816243773e-06, "loss": 1.1587, "step": 8710 }, { "epoch": 1.82, "grad_norm": 1.3268414735794067, "learning_rate": 1.007994438651373e-06, "loss": 1.154, "step": 8720 }, { "epoch": 1.82, "grad_norm": 1.4188928604125977, "learning_rate": 9.964082956783686e-07, "loss": 1.1608, "step": 8730 }, { "epoch": 1.82, "grad_norm": 1.4449864625930786, "learning_rate": 9.848221527053645e-07, "loss": 1.164, "step": 8740 }, { "epoch": 1.82, "grad_norm": 1.3237680196762085, "learning_rate": 9.732360097323603e-07, "loss": 1.1547, "step": 8750 }, { "epoch": 1.83, "grad_norm": 1.4183787107467651, "learning_rate": 9.616498667593558e-07, "loss": 1.1632, "step": 8760 }, { "epoch": 1.83, "grad_norm": 1.3433119058609009, "learning_rate": 9.500637237863516e-07, "loss": 1.1542, "step": 8770 }, { "epoch": 1.83, "grad_norm": 1.5118272304534912, "learning_rate": 9.384775808133473e-07, "loss": 1.1631, "step": 8780 }, { "epoch": 1.83, "grad_norm": 1.470158338546753, "learning_rate": 9.26891437840343e-07, "loss": 1.1638, "step": 8790 }, { "epoch": 1.84, "grad_norm": 1.408806324005127, "learning_rate": 9.153052948673386e-07, "loss": 1.1676, "step": 8800 }, { "epoch": 1.84, "grad_norm": 1.4310715198516846, "learning_rate": 9.037191518943345e-07, "loss": 1.1606, "step": 8810 }, { "epoch": 1.84, "grad_norm": 1.4561901092529297, "learning_rate": 8.921330089213302e-07, "loss": 1.1639, "step": 8820 }, { "epoch": 1.84, "grad_norm": 1.3165732622146606, "learning_rate": 8.805468659483258e-07, "loss": 1.1619, "step": 8830 }, { "epoch": 1.84, "grad_norm": 1.4040567874908447, "learning_rate": 8.689607229753216e-07, "loss": 1.1413, "step": 8840 }, { "epoch": 1.85, "grad_norm": 1.3458449840545654, "learning_rate": 8.573745800023173e-07, "loss": 1.1518, "step": 8850 }, { "epoch": 1.85, "grad_norm": 1.441347599029541, "learning_rate": 8.457884370293131e-07, "loss": 1.136, "step": 8860 }, { "epoch": 1.85, "grad_norm": 1.4389218091964722, "learning_rate": 8.342022940563087e-07, "loss": 1.1782, "step": 8870 }, { "epoch": 1.85, "grad_norm": 1.6026980876922607, "learning_rate": 8.226161510833044e-07, "loss": 1.1606, "step": 8880 }, { "epoch": 1.85, "grad_norm": 1.5719023942947388, "learning_rate": 8.110300081103001e-07, "loss": 1.1482, "step": 8890 }, { "epoch": 1.86, "grad_norm": 1.302774429321289, "learning_rate": 7.994438651372959e-07, "loss": 1.1545, "step": 8900 }, { "epoch": 1.86, "grad_norm": 1.3073673248291016, "learning_rate": 7.878577221642916e-07, "loss": 1.1514, "step": 8910 }, { "epoch": 1.86, "grad_norm": 1.4172277450561523, "learning_rate": 7.762715791912873e-07, "loss": 1.156, "step": 8920 }, { "epoch": 1.86, "grad_norm": 1.4633533954620361, "learning_rate": 7.64685436218283e-07, "loss": 1.1578, "step": 8930 }, { "epoch": 1.86, "grad_norm": 1.3664774894714355, "learning_rate": 7.530992932452786e-07, "loss": 1.1529, "step": 8940 }, { "epoch": 1.87, "grad_norm": 1.4149627685546875, "learning_rate": 7.415131502722745e-07, "loss": 1.1512, "step": 8950 }, { "epoch": 1.87, "grad_norm": 1.4857783317565918, "learning_rate": 7.299270072992701e-07, "loss": 1.1465, "step": 8960 }, { "epoch": 1.87, "grad_norm": 1.4186878204345703, "learning_rate": 7.183408643262658e-07, "loss": 1.1556, "step": 8970 }, { "epoch": 1.87, "grad_norm": 1.4118033647537231, "learning_rate": 7.067547213532616e-07, "loss": 1.1484, "step": 8980 }, { "epoch": 1.87, "grad_norm": 1.396365761756897, "learning_rate": 6.951685783802572e-07, "loss": 1.1646, "step": 8990 }, { "epoch": 1.88, "grad_norm": 1.5443834066390991, "learning_rate": 6.83582435407253e-07, "loss": 1.1621, "step": 9000 }, { "epoch": 1.88, "eval_loss": 1.178328037261963, "eval_runtime": 60.0885, "eval_samples_per_second": 103.181, "eval_steps_per_second": 12.898, "step": 9000 } ], "logging_steps": 10, "max_steps": 9590, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1, "total_flos": 7.483761454915518e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }