|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 11.97948717948718, |
|
"eval_steps": 1000, |
|
"global_step": 1752, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06837606837606838, |
|
"grad_norm": 10.4375, |
|
"learning_rate": 5.681818181818182e-07, |
|
"loss": 2.6042, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.13675213675213677, |
|
"grad_norm": 12.75, |
|
"learning_rate": 1.1363636363636364e-06, |
|
"loss": 2.577, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.20512820512820512, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 1.7045454545454546e-06, |
|
"loss": 2.4842, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.27350427350427353, |
|
"grad_norm": 15.4375, |
|
"learning_rate": 2.2727272727272728e-06, |
|
"loss": 2.4127, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.3418803418803419, |
|
"grad_norm": 21.75, |
|
"learning_rate": 2.8409090909090916e-06, |
|
"loss": 2.2065, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.41025641025641024, |
|
"grad_norm": 28.125, |
|
"learning_rate": 3.409090909090909e-06, |
|
"loss": 2.2638, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.47863247863247865, |
|
"grad_norm": 29.5, |
|
"learning_rate": 3.9772727272727275e-06, |
|
"loss": 2.0591, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5470085470085471, |
|
"grad_norm": 20.5, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 1.9693, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 19.875, |
|
"learning_rate": 5.113636363636364e-06, |
|
"loss": 1.9559, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6837606837606838, |
|
"grad_norm": 5.125, |
|
"learning_rate": 5.681818181818183e-06, |
|
"loss": 1.8579, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7521367521367521, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 6.25e-06, |
|
"loss": 1.8601, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8205128205128205, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 6.818181818181818e-06, |
|
"loss": 1.7222, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 7.386363636363637e-06, |
|
"loss": 1.7724, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.9572649572649573, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 7.954545454545455e-06, |
|
"loss": 1.8647, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 8.522727272727273e-06, |
|
"loss": 1.7017, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.0940170940170941, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 1.6883, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.1623931623931625, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 9.65909090909091e-06, |
|
"loss": 1.5485, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 9.999841055681184e-06, |
|
"loss": 1.5548, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.2991452991452992, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 9.998053048145735e-06, |
|
"loss": 1.602, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.3675213675213675, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 9.994279065509094e-06, |
|
"loss": 1.5151, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.435897435897436, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 9.988520607362297e-06, |
|
"loss": 1.4208, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.5042735042735043, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 9.98077996182722e-06, |
|
"loss": 1.4954, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.5726495726495726, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 9.971060204647384e-06, |
|
"loss": 1.3399, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.641025641025641, |
|
"grad_norm": 5.5, |
|
"learning_rate": 9.959365197965824e-06, |
|
"loss": 1.2554, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.7094017094017095, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 9.945699588790455e-06, |
|
"loss": 1.3161, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 9.930068807147585e-06, |
|
"loss": 1.3603, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"grad_norm": 6.375, |
|
"learning_rate": 9.912479063924309e-06, |
|
"loss": 1.2576, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.9145299145299144, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 9.8929373484006e-06, |
|
"loss": 1.207, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.982905982905983, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 9.871451425472128e-06, |
|
"loss": 1.232, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.051282051282051, |
|
"grad_norm": 7.125, |
|
"learning_rate": 9.848029832564875e-06, |
|
"loss": 1.0106, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.1196581196581197, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 9.822681876242797e-06, |
|
"loss": 0.9484, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.1880341880341883, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 9.795417628509857e-06, |
|
"loss": 1.0094, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.2564102564102564, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 9.766247922807927e-06, |
|
"loss": 0.9317, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.324786324786325, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 9.73518434971211e-06, |
|
"loss": 0.8936, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.393162393162393, |
|
"grad_norm": 7.375, |
|
"learning_rate": 9.702239252325237e-06, |
|
"loss": 0.9004, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 9.667425721373333e-06, |
|
"loss": 0.9335, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.52991452991453, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 9.630757590004023e-06, |
|
"loss": 0.8982, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.5982905982905984, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 9.592249428289935e-06, |
|
"loss": 0.8541, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 6.5625, |
|
"learning_rate": 9.551916537439282e-06, |
|
"loss": 0.8105, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.735042735042735, |
|
"grad_norm": 5.125, |
|
"learning_rate": 9.50977494371594e-06, |
|
"loss": 0.8308, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.8034188034188032, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 9.465841392071396e-06, |
|
"loss": 0.8515, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.871794871794872, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 9.420133339491171e-06, |
|
"loss": 0.6671, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.9401709401709404, |
|
"grad_norm": 6.21875, |
|
"learning_rate": 9.372668948058276e-06, |
|
"loss": 0.7728, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.0085470085470085, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 9.323467077736513e-06, |
|
"loss": 0.6978, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 9.272547278876475e-06, |
|
"loss": 0.561, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.1452991452991452, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 9.219929784447232e-06, |
|
"loss": 0.5354, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.213675213675214, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 9.16563550199674e-06, |
|
"loss": 0.5613, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.282051282051282, |
|
"grad_norm": 6.5, |
|
"learning_rate": 9.109686005344258e-06, |
|
"loss": 0.576, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.3504273504273505, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 9.052103526007976e-06, |
|
"loss": 0.492, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.4188034188034186, |
|
"grad_norm": 7.5, |
|
"learning_rate": 8.992910944371343e-06, |
|
"loss": 0.5087, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.4871794871794872, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 8.932131780591542e-06, |
|
"loss": 0.476, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.5555555555555554, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 8.869790185253766e-06, |
|
"loss": 0.4111, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.623931623931624, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 8.805910929774989e-06, |
|
"loss": 0.4426, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.6923076923076925, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 8.740519396561045e-06, |
|
"loss": 0.4171, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.7606837606837606, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 8.673641568920944e-06, |
|
"loss": 0.445, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.8290598290598292, |
|
"grad_norm": 5.75, |
|
"learning_rate": 8.60530402074241e-06, |
|
"loss": 0.4653, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.8974358974358974, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 8.535533905932739e-06, |
|
"loss": 0.4276, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.965811965811966, |
|
"grad_norm": 3.9375, |
|
"learning_rate": 8.46435894762922e-06, |
|
"loss": 0.423, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.034188034188034, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 8.39180742718334e-06, |
|
"loss": 0.3372, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 4.102564102564102, |
|
"grad_norm": 3.25, |
|
"learning_rate": 8.317908172923207e-06, |
|
"loss": 0.29, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.170940170940171, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 8.242690548698611e-06, |
|
"loss": 0.2464, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.239316239316239, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 8.166184442213314e-06, |
|
"loss": 0.2754, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.3076923076923075, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 8.088420253149173e-06, |
|
"loss": 0.2699, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.3760683760683765, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 8.009428881086836e-06, |
|
"loss": 0.2621, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 4.125, |
|
"learning_rate": 7.9292417132278e-06, |
|
"loss": 0.2688, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.512820512820513, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 7.847890611922721e-06, |
|
"loss": 0.2871, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.581196581196581, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 7.76540790201091e-06, |
|
"loss": 0.2439, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.64957264957265, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 7.68182635797606e-06, |
|
"loss": 0.2478, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 4.717948717948718, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 7.597179190923343e-06, |
|
"loss": 0.2385, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 4.786324786324786, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 7.511500035382943e-06, |
|
"loss": 0.2525, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.854700854700854, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 7.424822935945416e-06, |
|
"loss": 0.2448, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 4.923076923076923, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 7.33718233373407e-06, |
|
"loss": 0.2173, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 4.9914529914529915, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 7.248613052719793e-06, |
|
"loss": 0.1926, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 5.05982905982906, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 7.159150285883757e-06, |
|
"loss": 0.1754, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 5.128205128205128, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 7.0688295812334995e-06, |
|
"loss": 0.1334, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 5.196581196581197, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 6.977686827677926e-06, |
|
"loss": 0.147, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 5.264957264957265, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 6.885758240766867e-06, |
|
"loss": 0.1549, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 5.333333333333333, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 6.793080348300834e-06, |
|
"loss": 0.1312, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 5.401709401709402, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 6.69968997581671e-06, |
|
"loss": 0.1403, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 5.47008547008547, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 6.6056242319551315e-06, |
|
"loss": 0.1237, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 5.538461538461538, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 6.510920493715381e-06, |
|
"loss": 0.1233, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 5.6068376068376065, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 6.415616391603639e-06, |
|
"loss": 0.1667, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 5.6752136752136755, |
|
"grad_norm": 1.25, |
|
"learning_rate": 6.3197497946805205e-06, |
|
"loss": 0.1224, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 5.743589743589744, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 6.223358795513812e-06, |
|
"loss": 0.118, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 5.811965811965812, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 6.126481695042392e-06, |
|
"loss": 0.0897, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 5.880341880341881, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 6.029156987357373e-06, |
|
"loss": 0.1142, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 5.948717948717949, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 5.931423344406478e-06, |
|
"loss": 0.1542, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 6.017094017094017, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 5.8333196006277536e-06, |
|
"loss": 0.0937, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 6.085470085470085, |
|
"grad_norm": 1.75, |
|
"learning_rate": 5.734884737518714e-06, |
|
"loss": 0.0699, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 5.636157868147054e-06, |
|
"loss": 0.0757, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 6.222222222222222, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 5.537178221609088e-06, |
|
"loss": 0.0657, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 6.2905982905982905, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 5.437985127442065e-06, |
|
"loss": 0.0838, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 6.358974358974359, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 5.338617999996603e-06, |
|
"loss": 0.0876, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 6.427350427350428, |
|
"grad_norm": 1.0, |
|
"learning_rate": 5.239116322775392e-06, |
|
"loss": 0.0652, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 6.495726495726496, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 5.139519632744443e-06, |
|
"loss": 0.0843, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 6.564102564102564, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 5.039867504623084e-06, |
|
"loss": 0.0677, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 6.632478632478632, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 4.940199535158954e-06, |
|
"loss": 0.0764, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 6.700854700854701, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 4.8405553273942415e-06, |
|
"loss": 0.0642, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 6.769230769230769, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 4.740974474929438e-06, |
|
"loss": 0.0444, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 6.837606837606837, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 4.641496546190813e-06, |
|
"loss": 0.0713, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 6.837606837606837, |
|
"eval_loss": 0.19593119621276855, |
|
"eval_runtime": 5.1008, |
|
"eval_samples_per_second": 25.486, |
|
"eval_steps_per_second": 25.486, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 6.905982905982906, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 4.542161068707927e-06, |
|
"loss": 0.0494, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 6.9743589743589745, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 4.443007513407368e-06, |
|
"loss": 0.0492, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 7.042735042735043, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 4.344075278928989e-06, |
|
"loss": 0.1084, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 7.111111111111111, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 4.245403675970877e-06, |
|
"loss": 0.0605, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 7.17948717948718, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 4.147031911669243e-06, |
|
"loss": 0.0422, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 7.247863247863248, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 4.048999074019493e-06, |
|
"loss": 0.0388, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 7.316239316239316, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 3.951344116344606e-06, |
|
"loss": 0.0295, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 7.384615384615385, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 3.854105841817056e-06, |
|
"loss": 0.0545, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 7.452991452991453, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 3.7573228880403734e-06, |
|
"loss": 0.0337, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 7.521367521367521, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 3.661033711696501e-06, |
|
"loss": 0.0507, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 7.589743589743589, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 3.5652765732650523e-06, |
|
"loss": 0.0419, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 7.6581196581196584, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 3.4700895218205026e-06, |
|
"loss": 0.0423, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 7.726495726495727, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 3.375510379913418e-06, |
|
"loss": 0.0488, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 7.794871794871795, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 3.2815767285416576e-06, |
|
"loss": 0.0388, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 7.863247863247864, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 3.188325892217587e-06, |
|
"loss": 0.0197, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 7.931623931623932, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 3.0957949241371845e-06, |
|
"loss": 0.0326, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 3.0040205914569664e-06, |
|
"loss": 0.0383, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 8.068376068376068, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 2.913039360684565e-06, |
|
"loss": 0.0296, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 8.136752136752136, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 2.822887383188775e-06, |
|
"loss": 0.0564, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 8.205128205128204, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 2.7336004808348094e-06, |
|
"loss": 0.0295, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 8.273504273504274, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 2.645214131750498e-06, |
|
"loss": 0.0194, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 8.341880341880342, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 2.5577634562290567e-06, |
|
"loss": 0.0261, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 8.41025641025641, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 2.4712832027740545e-06, |
|
"loss": 0.0237, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 8.478632478632479, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 2.385807734292097e-06, |
|
"loss": 0.037, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 8.547008547008547, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 2.3013710144387374e-06, |
|
"loss": 0.0241, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 8.615384615384615, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 2.218006594123028e-06, |
|
"loss": 0.0258, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 8.683760683760683, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 2.1357475981760704e-06, |
|
"loss": 0.0361, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 8.752136752136753, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 2.0546267121888863e-06, |
|
"loss": 0.0243, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 8.820512820512821, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 1.9746761695247803e-06, |
|
"loss": 0.0404, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 8.88888888888889, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 1.8959277385114516e-06, |
|
"loss": 0.0284, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 8.957264957264957, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 1.8184127098178288e-06, |
|
"loss": 0.028, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 9.025641025641026, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 1.7421618840207576e-06, |
|
"loss": 0.0174, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 9.094017094017094, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 1.667205559366372e-06, |
|
"loss": 0.0293, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 9.162393162393162, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 1.5935735197311204e-06, |
|
"loss": 0.0459, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"grad_norm": 0.12890625, |
|
"learning_rate": 1.5212950227871292e-06, |
|
"loss": 0.0197, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 9.2991452991453, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 1.4503987883766857e-06, |
|
"loss": 0.0208, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 9.367521367521368, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 1.3809129871004113e-06, |
|
"loss": 0.0332, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 9.435897435897436, |
|
"grad_norm": 0.625, |
|
"learning_rate": 1.312865229123681e-06, |
|
"loss": 0.0186, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 9.504273504273504, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 1.2462825532057394e-06, |
|
"loss": 0.0365, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 9.572649572649572, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 1.1811914159558374e-06, |
|
"loss": 0.0172, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 9.64102564102564, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 1.117617681320729e-06, |
|
"loss": 0.0307, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 9.709401709401709, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 1.0555866103076212e-06, |
|
"loss": 0.0138, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 9.777777777777779, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 9.951228509467248e-07, |
|
"loss": 0.0204, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 9.846153846153847, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 9.362504284973683e-07, |
|
"loss": 0.017, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 9.914529914529915, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 8.789927359015643e-07, |
|
"loss": 0.0236, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 9.982905982905983, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 8.233725244888291e-07, |
|
"loss": 0.0234, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 10.051282051282051, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 7.694118949359553e-07, |
|
"loss": 0.015, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 10.11965811965812, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 7.171322884852988e-07, |
|
"loss": 0.0209, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 10.188034188034187, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 6.665544784251232e-07, |
|
"loss": 0.0156, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 10.256410256410255, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 6.176985618353282e-07, |
|
"loss": 0.0245, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 10.324786324786325, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 5.705839516018818e-07, |
|
"loss": 0.0143, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 10.393162393162394, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 5.252293687031196e-07, |
|
"loss": 0.0279, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 10.461538461538462, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 4.816528347709614e-07, |
|
"loss": 0.0215, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 10.52991452991453, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.398716649300311e-07, |
|
"loss": 0.0145, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 10.598290598290598, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 3.999024609174812e-07, |
|
"loss": 0.0196, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 10.666666666666666, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 3.61761104486314e-07, |
|
"loss": 0.0271, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 10.735042735042736, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 3.2546275109475554e-07, |
|
"loss": 0.0179, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 10.803418803418804, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 2.9102182388425106e-07, |
|
"loss": 0.0228, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 10.871794871794872, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 2.5845200794842154e-07, |
|
"loss": 0.0217, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 10.94017094017094, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 2.2776624489530664e-07, |
|
"loss": 0.0212, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 11.008547008547009, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.9897672770501198e-07, |
|
"loss": 0.0492, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 11.076923076923077, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.7209489588483396e-07, |
|
"loss": 0.0265, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 11.145299145299145, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.4713143092377534e-07, |
|
"loss": 0.0188, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 11.213675213675213, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.2409625204825802e-07, |
|
"loss": 0.025, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 11.282051282051283, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 1.0299851228072089e-07, |
|
"loss": 0.0186, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 11.350427350427351, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 8.384659480266733e-08, |
|
"loss": 0.0257, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 11.418803418803419, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 6.664810962361268e-08, |
|
"loss": 0.0213, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 11.487179487179487, |
|
"grad_norm": 3.125, |
|
"learning_rate": 5.1409890557246876e-08, |
|
"loss": 0.0137, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 11.555555555555555, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 3.813799250602046e-08, |
|
"loss": 0.0164, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 11.623931623931623, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 2.683768905523243e-08, |
|
"loss": 0.0197, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 11.692307692307692, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.7513470377570896e-08, |
|
"loss": 0.0246, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 11.760683760683762, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 1.016904144894304e-08, |
|
"loss": 0.0295, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 11.82905982905983, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 4.807320576307728e-09, |
|
"loss": 0.029, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 11.897435897435898, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.4304382380819771e-09, |
|
"loss": 0.0177, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 11.965811965811966, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 3.9736237600895846e-11, |
|
"loss": 0.0261, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 11.97948717948718, |
|
"step": 1752, |
|
"total_flos": 9.576163802677248e+16, |
|
"train_loss": 0.4554890414942311, |
|
"train_runtime": 1838.2954, |
|
"train_samples_per_second": 7.638, |
|
"train_steps_per_second": 0.953 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1752, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 12, |
|
"save_steps": 0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.576163802677248e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|