{ "best_metric": null, "best_model_checkpoint": null, "epoch": 11.97948717948718, "eval_steps": 1000, "global_step": 1752, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06837606837606838, "grad_norm": 10.4375, "learning_rate": 5.681818181818182e-07, "loss": 2.6042, "step": 10 }, { "epoch": 0.13675213675213677, "grad_norm": 12.75, "learning_rate": 1.1363636363636364e-06, "loss": 2.577, "step": 20 }, { "epoch": 0.20512820512820512, "grad_norm": 10.6875, "learning_rate": 1.7045454545454546e-06, "loss": 2.4842, "step": 30 }, { "epoch": 0.27350427350427353, "grad_norm": 15.4375, "learning_rate": 2.2727272727272728e-06, "loss": 2.4127, "step": 40 }, { "epoch": 0.3418803418803419, "grad_norm": 21.75, "learning_rate": 2.8409090909090916e-06, "loss": 2.2065, "step": 50 }, { "epoch": 0.41025641025641024, "grad_norm": 28.125, "learning_rate": 3.409090909090909e-06, "loss": 2.2638, "step": 60 }, { "epoch": 0.47863247863247865, "grad_norm": 29.5, "learning_rate": 3.9772727272727275e-06, "loss": 2.0591, "step": 70 }, { "epoch": 0.5470085470085471, "grad_norm": 20.5, "learning_rate": 4.5454545454545455e-06, "loss": 1.9693, "step": 80 }, { "epoch": 0.6153846153846154, "grad_norm": 19.875, "learning_rate": 5.113636363636364e-06, "loss": 1.9559, "step": 90 }, { "epoch": 0.6837606837606838, "grad_norm": 5.125, "learning_rate": 5.681818181818183e-06, "loss": 1.8579, "step": 100 }, { "epoch": 0.7521367521367521, "grad_norm": 5.5625, "learning_rate": 6.25e-06, "loss": 1.8601, "step": 110 }, { "epoch": 0.8205128205128205, "grad_norm": 4.34375, "learning_rate": 6.818181818181818e-06, "loss": 1.7222, "step": 120 }, { "epoch": 0.8888888888888888, "grad_norm": 6.03125, "learning_rate": 7.386363636363637e-06, "loss": 1.7724, "step": 130 }, { "epoch": 0.9572649572649573, "grad_norm": 5.6875, "learning_rate": 7.954545454545455e-06, "loss": 1.8647, "step": 140 }, { "epoch": 1.0256410256410255, "grad_norm": 5.0625, "learning_rate": 8.522727272727273e-06, "loss": 1.7017, "step": 150 }, { "epoch": 1.0940170940170941, "grad_norm": 4.5625, "learning_rate": 9.090909090909091e-06, "loss": 1.6883, "step": 160 }, { "epoch": 1.1623931623931625, "grad_norm": 5.3125, "learning_rate": 9.65909090909091e-06, "loss": 1.5485, "step": 170 }, { "epoch": 1.2307692307692308, "grad_norm": 4.96875, "learning_rate": 9.999841055681184e-06, "loss": 1.5548, "step": 180 }, { "epoch": 1.2991452991452992, "grad_norm": 5.4375, "learning_rate": 9.998053048145735e-06, "loss": 1.602, "step": 190 }, { "epoch": 1.3675213675213675, "grad_norm": 3.671875, "learning_rate": 9.994279065509094e-06, "loss": 1.5151, "step": 200 }, { "epoch": 1.435897435897436, "grad_norm": 5.1875, "learning_rate": 9.988520607362297e-06, "loss": 1.4208, "step": 210 }, { "epoch": 1.5042735042735043, "grad_norm": 5.90625, "learning_rate": 9.98077996182722e-06, "loss": 1.4954, "step": 220 }, { "epoch": 1.5726495726495726, "grad_norm": 5.84375, "learning_rate": 9.971060204647384e-06, "loss": 1.3399, "step": 230 }, { "epoch": 1.641025641025641, "grad_norm": 5.5, "learning_rate": 9.959365197965824e-06, "loss": 1.2554, "step": 240 }, { "epoch": 1.7094017094017095, "grad_norm": 6.65625, "learning_rate": 9.945699588790455e-06, "loss": 1.3161, "step": 250 }, { "epoch": 1.7777777777777777, "grad_norm": 7.5625, "learning_rate": 9.930068807147585e-06, "loss": 1.3603, "step": 260 }, { "epoch": 1.8461538461538463, "grad_norm": 6.375, "learning_rate": 9.912479063924309e-06, "loss": 1.2576, "step": 270 }, { "epoch": 1.9145299145299144, "grad_norm": 6.84375, "learning_rate": 9.8929373484006e-06, "loss": 1.207, "step": 280 }, { "epoch": 1.982905982905983, "grad_norm": 4.90625, "learning_rate": 9.871451425472128e-06, "loss": 1.232, "step": 290 }, { "epoch": 2.051282051282051, "grad_norm": 7.125, "learning_rate": 9.848029832564875e-06, "loss": 1.0106, "step": 300 }, { "epoch": 2.1196581196581197, "grad_norm": 6.65625, "learning_rate": 9.822681876242797e-06, "loss": 0.9484, "step": 310 }, { "epoch": 2.1880341880341883, "grad_norm": 5.1875, "learning_rate": 9.795417628509857e-06, "loss": 1.0094, "step": 320 }, { "epoch": 2.2564102564102564, "grad_norm": 5.28125, "learning_rate": 9.766247922807927e-06, "loss": 0.9317, "step": 330 }, { "epoch": 2.324786324786325, "grad_norm": 5.09375, "learning_rate": 9.73518434971211e-06, "loss": 0.8936, "step": 340 }, { "epoch": 2.393162393162393, "grad_norm": 7.375, "learning_rate": 9.702239252325237e-06, "loss": 0.9004, "step": 350 }, { "epoch": 2.4615384615384617, "grad_norm": 6.3125, "learning_rate": 9.667425721373333e-06, "loss": 0.9335, "step": 360 }, { "epoch": 2.52991452991453, "grad_norm": 7.0625, "learning_rate": 9.630757590004023e-06, "loss": 0.8982, "step": 370 }, { "epoch": 2.5982905982905984, "grad_norm": 4.40625, "learning_rate": 9.592249428289935e-06, "loss": 0.8541, "step": 380 }, { "epoch": 2.6666666666666665, "grad_norm": 6.5625, "learning_rate": 9.551916537439282e-06, "loss": 0.8105, "step": 390 }, { "epoch": 2.735042735042735, "grad_norm": 5.125, "learning_rate": 9.50977494371594e-06, "loss": 0.8308, "step": 400 }, { "epoch": 2.8034188034188032, "grad_norm": 5.0625, "learning_rate": 9.465841392071396e-06, "loss": 0.8515, "step": 410 }, { "epoch": 2.871794871794872, "grad_norm": 6.53125, "learning_rate": 9.420133339491171e-06, "loss": 0.6671, "step": 420 }, { "epoch": 2.9401709401709404, "grad_norm": 6.21875, "learning_rate": 9.372668948058276e-06, "loss": 0.7728, "step": 430 }, { "epoch": 3.0085470085470085, "grad_norm": 5.1875, "learning_rate": 9.323467077736513e-06, "loss": 0.6978, "step": 440 }, { "epoch": 3.076923076923077, "grad_norm": 5.9375, "learning_rate": 9.272547278876475e-06, "loss": 0.561, "step": 450 }, { "epoch": 3.1452991452991452, "grad_norm": 5.21875, "learning_rate": 9.219929784447232e-06, "loss": 0.5354, "step": 460 }, { "epoch": 3.213675213675214, "grad_norm": 4.90625, "learning_rate": 9.16563550199674e-06, "loss": 0.5613, "step": 470 }, { "epoch": 3.282051282051282, "grad_norm": 6.5, "learning_rate": 9.109686005344258e-06, "loss": 0.576, "step": 480 }, { "epoch": 3.3504273504273505, "grad_norm": 5.71875, "learning_rate": 9.052103526007976e-06, "loss": 0.492, "step": 490 }, { "epoch": 3.4188034188034186, "grad_norm": 7.5, "learning_rate": 8.992910944371343e-06, "loss": 0.5087, "step": 500 }, { "epoch": 3.4871794871794872, "grad_norm": 6.59375, "learning_rate": 8.932131780591542e-06, "loss": 0.476, "step": 510 }, { "epoch": 3.5555555555555554, "grad_norm": 4.4375, "learning_rate": 8.869790185253766e-06, "loss": 0.4111, "step": 520 }, { "epoch": 3.623931623931624, "grad_norm": 5.09375, "learning_rate": 8.805910929774989e-06, "loss": 0.4426, "step": 530 }, { "epoch": 3.6923076923076925, "grad_norm": 4.46875, "learning_rate": 8.740519396561045e-06, "loss": 0.4171, "step": 540 }, { "epoch": 3.7606837606837606, "grad_norm": 5.28125, "learning_rate": 8.673641568920944e-06, "loss": 0.445, "step": 550 }, { "epoch": 3.8290598290598292, "grad_norm": 5.75, "learning_rate": 8.60530402074241e-06, "loss": 0.4653, "step": 560 }, { "epoch": 3.8974358974358974, "grad_norm": 2.984375, "learning_rate": 8.535533905932739e-06, "loss": 0.4276, "step": 570 }, { "epoch": 3.965811965811966, "grad_norm": 3.9375, "learning_rate": 8.46435894762922e-06, "loss": 0.423, "step": 580 }, { "epoch": 4.034188034188034, "grad_norm": 2.96875, "learning_rate": 8.39180742718334e-06, "loss": 0.3372, "step": 590 }, { "epoch": 4.102564102564102, "grad_norm": 3.25, "learning_rate": 8.317908172923207e-06, "loss": 0.29, "step": 600 }, { "epoch": 4.170940170940171, "grad_norm": 3.84375, "learning_rate": 8.242690548698611e-06, "loss": 0.2464, "step": 610 }, { "epoch": 4.239316239316239, "grad_norm": 4.28125, "learning_rate": 8.166184442213314e-06, "loss": 0.2754, "step": 620 }, { "epoch": 4.3076923076923075, "grad_norm": 2.796875, "learning_rate": 8.088420253149173e-06, "loss": 0.2699, "step": 630 }, { "epoch": 4.3760683760683765, "grad_norm": 2.796875, "learning_rate": 8.009428881086836e-06, "loss": 0.2621, "step": 640 }, { "epoch": 4.444444444444445, "grad_norm": 4.125, "learning_rate": 7.9292417132278e-06, "loss": 0.2688, "step": 650 }, { "epoch": 4.512820512820513, "grad_norm": 2.9375, "learning_rate": 7.847890611922721e-06, "loss": 0.2871, "step": 660 }, { "epoch": 4.581196581196581, "grad_norm": 2.78125, "learning_rate": 7.76540790201091e-06, "loss": 0.2439, "step": 670 }, { "epoch": 4.64957264957265, "grad_norm": 5.5625, "learning_rate": 7.68182635797606e-06, "loss": 0.2478, "step": 680 }, { "epoch": 4.717948717948718, "grad_norm": 5.09375, "learning_rate": 7.597179190923343e-06, "loss": 0.2385, "step": 690 }, { "epoch": 4.786324786324786, "grad_norm": 2.53125, "learning_rate": 7.511500035382943e-06, "loss": 0.2525, "step": 700 }, { "epoch": 4.854700854700854, "grad_norm": 3.734375, "learning_rate": 7.424822935945416e-06, "loss": 0.2448, "step": 710 }, { "epoch": 4.923076923076923, "grad_norm": 2.453125, "learning_rate": 7.33718233373407e-06, "loss": 0.2173, "step": 720 }, { "epoch": 4.9914529914529915, "grad_norm": 2.0625, "learning_rate": 7.248613052719793e-06, "loss": 0.1926, "step": 730 }, { "epoch": 5.05982905982906, "grad_norm": 2.046875, "learning_rate": 7.159150285883757e-06, "loss": 0.1754, "step": 740 }, { "epoch": 5.128205128205128, "grad_norm": 2.4375, "learning_rate": 7.0688295812334995e-06, "loss": 0.1334, "step": 750 }, { "epoch": 5.196581196581197, "grad_norm": 1.84375, "learning_rate": 6.977686827677926e-06, "loss": 0.147, "step": 760 }, { "epoch": 5.264957264957265, "grad_norm": 1.65625, "learning_rate": 6.885758240766867e-06, "loss": 0.1549, "step": 770 }, { "epoch": 5.333333333333333, "grad_norm": 2.3125, "learning_rate": 6.793080348300834e-06, "loss": 0.1312, "step": 780 }, { "epoch": 5.401709401709402, "grad_norm": 1.59375, "learning_rate": 6.69968997581671e-06, "loss": 0.1403, "step": 790 }, { "epoch": 5.47008547008547, "grad_norm": 2.21875, "learning_rate": 6.6056242319551315e-06, "loss": 0.1237, "step": 800 }, { "epoch": 5.538461538461538, "grad_norm": 3.046875, "learning_rate": 6.510920493715381e-06, "loss": 0.1233, "step": 810 }, { "epoch": 5.6068376068376065, "grad_norm": 0.87109375, "learning_rate": 6.415616391603639e-06, "loss": 0.1667, "step": 820 }, { "epoch": 5.6752136752136755, "grad_norm": 1.25, "learning_rate": 6.3197497946805205e-06, "loss": 0.1224, "step": 830 }, { "epoch": 5.743589743589744, "grad_norm": 2.421875, "learning_rate": 6.223358795513812e-06, "loss": 0.118, "step": 840 }, { "epoch": 5.811965811965812, "grad_norm": 1.3515625, "learning_rate": 6.126481695042392e-06, "loss": 0.0897, "step": 850 }, { "epoch": 5.880341880341881, "grad_norm": 1.5390625, "learning_rate": 6.029156987357373e-06, "loss": 0.1142, "step": 860 }, { "epoch": 5.948717948717949, "grad_norm": 1.78125, "learning_rate": 5.931423344406478e-06, "loss": 0.1542, "step": 870 }, { "epoch": 6.017094017094017, "grad_norm": 1.2890625, "learning_rate": 5.8333196006277536e-06, "loss": 0.0937, "step": 880 }, { "epoch": 6.085470085470085, "grad_norm": 1.75, "learning_rate": 5.734884737518714e-06, "loss": 0.0699, "step": 890 }, { "epoch": 6.153846153846154, "grad_norm": 0.96484375, "learning_rate": 5.636157868147054e-06, "loss": 0.0757, "step": 900 }, { "epoch": 6.222222222222222, "grad_norm": 0.84765625, "learning_rate": 5.537178221609088e-06, "loss": 0.0657, "step": 910 }, { "epoch": 6.2905982905982905, "grad_norm": 2.328125, "learning_rate": 5.437985127442065e-06, "loss": 0.0838, "step": 920 }, { "epoch": 6.358974358974359, "grad_norm": 1.015625, "learning_rate": 5.338617999996603e-06, "loss": 0.0876, "step": 930 }, { "epoch": 6.427350427350428, "grad_norm": 1.0, "learning_rate": 5.239116322775392e-06, "loss": 0.0652, "step": 940 }, { "epoch": 6.495726495726496, "grad_norm": 0.96484375, "learning_rate": 5.139519632744443e-06, "loss": 0.0843, "step": 950 }, { "epoch": 6.564102564102564, "grad_norm": 1.3828125, "learning_rate": 5.039867504623084e-06, "loss": 0.0677, "step": 960 }, { "epoch": 6.632478632478632, "grad_norm": 1.0390625, "learning_rate": 4.940199535158954e-06, "loss": 0.0764, "step": 970 }, { "epoch": 6.700854700854701, "grad_norm": 0.828125, "learning_rate": 4.8405553273942415e-06, "loss": 0.0642, "step": 980 }, { "epoch": 6.769230769230769, "grad_norm": 1.1796875, "learning_rate": 4.740974474929438e-06, "loss": 0.0444, "step": 990 }, { "epoch": 6.837606837606837, "grad_norm": 0.83203125, "learning_rate": 4.641496546190813e-06, "loss": 0.0713, "step": 1000 }, { "epoch": 6.837606837606837, "eval_loss": 0.19593119621276855, "eval_runtime": 5.1008, "eval_samples_per_second": 25.486, "eval_steps_per_second": 25.486, "step": 1000 }, { "epoch": 6.905982905982906, "grad_norm": 0.4765625, "learning_rate": 4.542161068707927e-06, "loss": 0.0494, "step": 1010 }, { "epoch": 6.9743589743589745, "grad_norm": 0.78125, "learning_rate": 4.443007513407368e-06, "loss": 0.0492, "step": 1020 }, { "epoch": 7.042735042735043, "grad_norm": 0.73046875, "learning_rate": 4.344075278928989e-06, "loss": 0.1084, "step": 1030 }, { "epoch": 7.111111111111111, "grad_norm": 0.78515625, "learning_rate": 4.245403675970877e-06, "loss": 0.0605, "step": 1040 }, { "epoch": 7.17948717948718, "grad_norm": 0.79296875, "learning_rate": 4.147031911669243e-06, "loss": 0.0422, "step": 1050 }, { "epoch": 7.247863247863248, "grad_norm": 1.796875, "learning_rate": 4.048999074019493e-06, "loss": 0.0388, "step": 1060 }, { "epoch": 7.316239316239316, "grad_norm": 0.88671875, "learning_rate": 3.951344116344606e-06, "loss": 0.0295, "step": 1070 }, { "epoch": 7.384615384615385, "grad_norm": 0.44140625, "learning_rate": 3.854105841817056e-06, "loss": 0.0545, "step": 1080 }, { "epoch": 7.452991452991453, "grad_norm": 0.64453125, "learning_rate": 3.7573228880403734e-06, "loss": 0.0337, "step": 1090 }, { "epoch": 7.521367521367521, "grad_norm": 0.5703125, "learning_rate": 3.661033711696501e-06, "loss": 0.0507, "step": 1100 }, { "epoch": 7.589743589743589, "grad_norm": 0.6328125, "learning_rate": 3.5652765732650523e-06, "loss": 0.0419, "step": 1110 }, { "epoch": 7.6581196581196584, "grad_norm": 1.21875, "learning_rate": 3.4700895218205026e-06, "loss": 0.0423, "step": 1120 }, { "epoch": 7.726495726495727, "grad_norm": 1.234375, "learning_rate": 3.375510379913418e-06, "loss": 0.0488, "step": 1130 }, { "epoch": 7.794871794871795, "grad_norm": 0.4140625, "learning_rate": 3.2815767285416576e-06, "loss": 0.0388, "step": 1140 }, { "epoch": 7.863247863247864, "grad_norm": 0.462890625, "learning_rate": 3.188325892217587e-06, "loss": 0.0197, "step": 1150 }, { "epoch": 7.931623931623932, "grad_norm": 0.390625, "learning_rate": 3.0957949241371845e-06, "loss": 0.0326, "step": 1160 }, { "epoch": 8.0, "grad_norm": 0.57421875, "learning_rate": 3.0040205914569664e-06, "loss": 0.0383, "step": 1170 }, { "epoch": 8.068376068376068, "grad_norm": 0.40234375, "learning_rate": 2.913039360684565e-06, "loss": 0.0296, "step": 1180 }, { "epoch": 8.136752136752136, "grad_norm": 0.482421875, "learning_rate": 2.822887383188775e-06, "loss": 0.0564, "step": 1190 }, { "epoch": 8.205128205128204, "grad_norm": 0.435546875, "learning_rate": 2.7336004808348094e-06, "loss": 0.0295, "step": 1200 }, { "epoch": 8.273504273504274, "grad_norm": 0.435546875, "learning_rate": 2.645214131750498e-06, "loss": 0.0194, "step": 1210 }, { "epoch": 8.341880341880342, "grad_norm": 0.2275390625, "learning_rate": 2.5577634562290567e-06, "loss": 0.0261, "step": 1220 }, { "epoch": 8.41025641025641, "grad_norm": 0.6171875, "learning_rate": 2.4712832027740545e-06, "loss": 0.0237, "step": 1230 }, { "epoch": 8.478632478632479, "grad_norm": 0.15625, "learning_rate": 2.385807734292097e-06, "loss": 0.037, "step": 1240 }, { "epoch": 8.547008547008547, "grad_norm": 0.63671875, "learning_rate": 2.3013710144387374e-06, "loss": 0.0241, "step": 1250 }, { "epoch": 8.615384615384615, "grad_norm": 0.53515625, "learning_rate": 2.218006594123028e-06, "loss": 0.0258, "step": 1260 }, { "epoch": 8.683760683760683, "grad_norm": 0.73828125, "learning_rate": 2.1357475981760704e-06, "loss": 0.0361, "step": 1270 }, { "epoch": 8.752136752136753, "grad_norm": 0.333984375, "learning_rate": 2.0546267121888863e-06, "loss": 0.0243, "step": 1280 }, { "epoch": 8.820512820512821, "grad_norm": 0.404296875, "learning_rate": 1.9746761695247803e-06, "loss": 0.0404, "step": 1290 }, { "epoch": 8.88888888888889, "grad_norm": 0.50390625, "learning_rate": 1.8959277385114516e-06, "loss": 0.0284, "step": 1300 }, { "epoch": 8.957264957264957, "grad_norm": 0.265625, "learning_rate": 1.8184127098178288e-06, "loss": 0.028, "step": 1310 }, { "epoch": 9.025641025641026, "grad_norm": 0.298828125, "learning_rate": 1.7421618840207576e-06, "loss": 0.0174, "step": 1320 }, { "epoch": 9.094017094017094, "grad_norm": 0.47265625, "learning_rate": 1.667205559366372e-06, "loss": 0.0293, "step": 1330 }, { "epoch": 9.162393162393162, "grad_norm": 0.2275390625, "learning_rate": 1.5935735197311204e-06, "loss": 0.0459, "step": 1340 }, { "epoch": 9.23076923076923, "grad_norm": 0.12890625, "learning_rate": 1.5212950227871292e-06, "loss": 0.0197, "step": 1350 }, { "epoch": 9.2991452991453, "grad_norm": 0.2470703125, "learning_rate": 1.4503987883766857e-06, "loss": 0.0208, "step": 1360 }, { "epoch": 9.367521367521368, "grad_norm": 0.400390625, "learning_rate": 1.3809129871004113e-06, "loss": 0.0332, "step": 1370 }, { "epoch": 9.435897435897436, "grad_norm": 0.625, "learning_rate": 1.312865229123681e-06, "loss": 0.0186, "step": 1380 }, { "epoch": 9.504273504273504, "grad_norm": 0.32421875, "learning_rate": 1.2462825532057394e-06, "loss": 0.0365, "step": 1390 }, { "epoch": 9.572649572649572, "grad_norm": 0.330078125, "learning_rate": 1.1811914159558374e-06, "loss": 0.0172, "step": 1400 }, { "epoch": 9.64102564102564, "grad_norm": 0.1943359375, "learning_rate": 1.117617681320729e-06, "loss": 0.0307, "step": 1410 }, { "epoch": 9.709401709401709, "grad_norm": 0.177734375, "learning_rate": 1.0555866103076212e-06, "loss": 0.0138, "step": 1420 }, { "epoch": 9.777777777777779, "grad_norm": 0.369140625, "learning_rate": 9.951228509467248e-07, "loss": 0.0204, "step": 1430 }, { "epoch": 9.846153846153847, "grad_norm": 0.2421875, "learning_rate": 9.362504284973683e-07, "loss": 0.017, "step": 1440 }, { "epoch": 9.914529914529915, "grad_norm": 0.2119140625, "learning_rate": 8.789927359015643e-07, "loss": 0.0236, "step": 1450 }, { "epoch": 9.982905982905983, "grad_norm": 0.296875, "learning_rate": 8.233725244888291e-07, "loss": 0.0234, "step": 1460 }, { "epoch": 10.051282051282051, "grad_norm": 0.3828125, "learning_rate": 7.694118949359553e-07, "loss": 0.015, "step": 1470 }, { "epoch": 10.11965811965812, "grad_norm": 0.1845703125, "learning_rate": 7.171322884852988e-07, "loss": 0.0209, "step": 1480 }, { "epoch": 10.188034188034187, "grad_norm": 0.20703125, "learning_rate": 6.665544784251232e-07, "loss": 0.0156, "step": 1490 }, { "epoch": 10.256410256410255, "grad_norm": 0.380859375, "learning_rate": 6.176985618353282e-07, "loss": 0.0245, "step": 1500 }, { "epoch": 10.324786324786325, "grad_norm": 0.5234375, "learning_rate": 5.705839516018818e-07, "loss": 0.0143, "step": 1510 }, { "epoch": 10.393162393162394, "grad_norm": 0.9375, "learning_rate": 5.252293687031196e-07, "loss": 0.0279, "step": 1520 }, { "epoch": 10.461538461538462, "grad_norm": 0.1884765625, "learning_rate": 4.816528347709614e-07, "loss": 0.0215, "step": 1530 }, { "epoch": 10.52991452991453, "grad_norm": 0.84765625, "learning_rate": 4.398716649300311e-07, "loss": 0.0145, "step": 1540 }, { "epoch": 10.598290598290598, "grad_norm": 0.6015625, "learning_rate": 3.999024609174812e-07, "loss": 0.0196, "step": 1550 }, { "epoch": 10.666666666666666, "grad_norm": 1.3671875, "learning_rate": 3.61761104486314e-07, "loss": 0.0271, "step": 1560 }, { "epoch": 10.735042735042736, "grad_norm": 0.70703125, "learning_rate": 3.2546275109475554e-07, "loss": 0.0179, "step": 1570 }, { "epoch": 10.803418803418804, "grad_norm": 0.796875, "learning_rate": 2.9102182388425106e-07, "loss": 0.0228, "step": 1580 }, { "epoch": 10.871794871794872, "grad_norm": 0.451171875, "learning_rate": 2.5845200794842154e-07, "loss": 0.0217, "step": 1590 }, { "epoch": 10.94017094017094, "grad_norm": 0.95703125, "learning_rate": 2.2776624489530664e-07, "loss": 0.0212, "step": 1600 }, { "epoch": 11.008547008547009, "grad_norm": 1.6796875, "learning_rate": 1.9897672770501198e-07, "loss": 0.0492, "step": 1610 }, { "epoch": 11.076923076923077, "grad_norm": 2.171875, "learning_rate": 1.7209489588483396e-07, "loss": 0.0265, "step": 1620 }, { "epoch": 11.145299145299145, "grad_norm": 2.171875, "learning_rate": 1.4713143092377534e-07, "loss": 0.0188, "step": 1630 }, { "epoch": 11.213675213675213, "grad_norm": 2.625, "learning_rate": 1.2409625204825802e-07, "loss": 0.025, "step": 1640 }, { "epoch": 11.282051282051283, "grad_norm": 3.328125, "learning_rate": 1.0299851228072089e-07, "loss": 0.0186, "step": 1650 }, { "epoch": 11.350427350427351, "grad_norm": 5.40625, "learning_rate": 8.384659480266733e-08, "loss": 0.0257, "step": 1660 }, { "epoch": 11.418803418803419, "grad_norm": 4.09375, "learning_rate": 6.664810962361268e-08, "loss": 0.0213, "step": 1670 }, { "epoch": 11.487179487179487, "grad_norm": 3.125, "learning_rate": 5.1409890557246876e-08, "loss": 0.0137, "step": 1680 }, { "epoch": 11.555555555555555, "grad_norm": 5.21875, "learning_rate": 3.813799250602046e-08, "loss": 0.0164, "step": 1690 }, { "epoch": 11.623931623931623, "grad_norm": 0.328125, "learning_rate": 2.683768905523243e-08, "loss": 0.0197, "step": 1700 }, { "epoch": 11.692307692307692, "grad_norm": 0.796875, "learning_rate": 1.7513470377570896e-08, "loss": 0.0246, "step": 1710 }, { "epoch": 11.760683760683762, "grad_norm": 0.92578125, "learning_rate": 1.016904144894304e-08, "loss": 0.0295, "step": 1720 }, { "epoch": 11.82905982905983, "grad_norm": 1.3671875, "learning_rate": 4.807320576307728e-09, "loss": 0.029, "step": 1730 }, { "epoch": 11.897435897435898, "grad_norm": 2.203125, "learning_rate": 1.4304382380819771e-09, "loss": 0.0177, "step": 1740 }, { "epoch": 11.965811965811966, "grad_norm": 1.0234375, "learning_rate": 3.9736237600895846e-11, "loss": 0.0261, "step": 1750 } ], "logging_steps": 10, "max_steps": 1752, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.576163802677248e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }