|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 31.91571553994732, |
|
"eval_steps": 2000, |
|
"global_step": 4544, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.07023705004389816, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 2.197802197802198e-07, |
|
"loss": 2.4857, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.14047410008779632, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 4.395604395604396e-07, |
|
"loss": 2.5113, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.21071115013169447, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 6.593406593406594e-07, |
|
"loss": 2.4922, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.28094820017559263, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 8.791208791208792e-07, |
|
"loss": 2.4394, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.35118525021949076, |
|
"grad_norm": 16.75, |
|
"learning_rate": 1.098901098901099e-06, |
|
"loss": 2.4773, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.42142230026338895, |
|
"grad_norm": 14.5625, |
|
"learning_rate": 1.3186813186813187e-06, |
|
"loss": 2.3524, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4916593503072871, |
|
"grad_norm": 15.125, |
|
"learning_rate": 1.5384615384615387e-06, |
|
"loss": 2.42, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5618964003511853, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 1.7582417582417585e-06, |
|
"loss": 2.3288, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6321334503950834, |
|
"grad_norm": 12.6875, |
|
"learning_rate": 1.9780219780219782e-06, |
|
"loss": 2.2519, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.7023705004389815, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 2.197802197802198e-06, |
|
"loss": 2.2021, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7726075504828798, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 2.4175824175824177e-06, |
|
"loss": 2.2151, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8428446005267779, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 2.6373626373626375e-06, |
|
"loss": 2.1102, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.913081650570676, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 2.8571428571428573e-06, |
|
"loss": 2.1034, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.9833187006145742, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 3.0769230769230774e-06, |
|
"loss": 2.0737, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.0535557506584723, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 3.2967032967032968e-06, |
|
"loss": 2.0345, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.1237928007023705, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 3.516483516483517e-06, |
|
"loss": 1.9568, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.1940298507462686, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 3.7362637362637367e-06, |
|
"loss": 1.9494, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.2642669007901668, |
|
"grad_norm": 1.875, |
|
"learning_rate": 3.9560439560439565e-06, |
|
"loss": 1.8932, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.334503950834065, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 4.175824175824177e-06, |
|
"loss": 1.9219, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.404741000877963, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 4.395604395604396e-06, |
|
"loss": 1.9751, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.4749780509218613, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.615384615384616e-06, |
|
"loss": 1.8729, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.5452151009657595, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 4.8351648351648355e-06, |
|
"loss": 1.885, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.6154521510096576, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 5.054945054945055e-06, |
|
"loss": 1.8549, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.6856892010535558, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 5.274725274725275e-06, |
|
"loss": 1.8736, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.755926251097454, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 5.494505494505495e-06, |
|
"loss": 1.8388, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.826163301141352, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 5.7142857142857145e-06, |
|
"loss": 1.8676, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.89640035118525, |
|
"grad_norm": 2.0, |
|
"learning_rate": 5.934065934065935e-06, |
|
"loss": 1.8181, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.9666374012291485, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 6.153846153846155e-06, |
|
"loss": 1.8007, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.0368744512730466, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 6.373626373626373e-06, |
|
"loss": 1.7727, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.1071115013169446, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 6.5934065934065935e-06, |
|
"loss": 1.7038, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.177348551360843, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 6.813186813186814e-06, |
|
"loss": 1.7398, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.247585601404741, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 7.032967032967034e-06, |
|
"loss": 1.7468, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.317822651448639, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 7.252747252747253e-06, |
|
"loss": 1.7326, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.388059701492537, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 7.472527472527473e-06, |
|
"loss": 1.7779, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.4582967515364356, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 1.6381, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.5285338015803336, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 7.912087912087913e-06, |
|
"loss": 1.6605, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.5987708516242316, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 8.131868131868132e-06, |
|
"loss": 1.6683, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.66900790166813, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 8.351648351648353e-06, |
|
"loss": 1.6819, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.739244951712028, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 8.571428571428571e-06, |
|
"loss": 1.7072, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.809482001755926, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 8.791208791208792e-06, |
|
"loss": 1.6747, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.8797190517998246, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 9.010989010989011e-06, |
|
"loss": 1.6596, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.9499561018437226, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 9.230769230769232e-06, |
|
"loss": 1.65, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.0201931518876206, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 9.450549450549452e-06, |
|
"loss": 1.6037, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.090430201931519, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 9.670329670329671e-06, |
|
"loss": 1.6015, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.160667251975417, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 9.890109890109892e-06, |
|
"loss": 1.5708, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.230904302019315, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 9.99996310691202e-06, |
|
"loss": 1.5778, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.3011413520632136, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 9.999667965474806e-06, |
|
"loss": 1.5528, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.3713784021071116, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 9.9990777000222e-06, |
|
"loss": 1.5539, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.4416154521510096, |
|
"grad_norm": 2.5, |
|
"learning_rate": 9.998192345396817e-06, |
|
"loss": 1.5289, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.511852502194908, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 9.997011953860014e-06, |
|
"loss": 1.5335, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.582089552238806, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 9.99553659508879e-06, |
|
"loss": 1.5565, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.652326602282704, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 9.993766356171694e-06, |
|
"loss": 1.5414, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.722563652326602, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 9.991701341603667e-06, |
|
"loss": 1.5271, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.7928007023705006, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 9.989341673279881e-06, |
|
"loss": 1.4782, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.8630377524143986, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 9.986687490488545e-06, |
|
"loss": 1.5165, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.9332748024582966, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 9.98373894990268e-06, |
|
"loss": 1.507, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 4.003511852502195, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 9.980496225570869e-06, |
|
"loss": 1.5207, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 4.073748902546093, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 9.976959508906992e-06, |
|
"loss": 1.4324, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.143985952589992, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 9.973129008678915e-06, |
|
"loss": 1.4171, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 4.214223002633889, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 9.969004950996175e-06, |
|
"loss": 1.4378, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.284460052677788, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 9.964587579296631e-06, |
|
"loss": 1.3886, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.354697102721686, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 9.959877154332095e-06, |
|
"loss": 1.4154, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.424934152765584, |
|
"grad_norm": 1.625, |
|
"learning_rate": 9.954873954152933e-06, |
|
"loss": 1.4231, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.495171202809482, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 9.949578274091666e-06, |
|
"loss": 1.3745, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.565408252853381, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 9.943990426745525e-06, |
|
"loss": 1.3663, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.635645302897278, |
|
"grad_norm": 1.5, |
|
"learning_rate": 9.938110741958003e-06, |
|
"loss": 1.4177, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.705882352941177, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 9.931939566799385e-06, |
|
"loss": 1.412, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.776119402985074, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 9.925477265546258e-06, |
|
"loss": 1.4155, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 4.846356453028973, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 9.918724219660013e-06, |
|
"loss": 1.4241, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 4.916593503072871, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 9.911680827764329e-06, |
|
"loss": 1.3562, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.98683055311677, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 9.90434750562163e-06, |
|
"loss": 1.3542, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 5.057067603160667, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 9.896724686108561e-06, |
|
"loss": 1.305, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 5.127304653204566, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 9.888812819190419e-06, |
|
"loss": 1.314, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 5.197541703248463, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 9.88061237189461e-06, |
|
"loss": 1.2962, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 5.267778753292362, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 9.872123828283063e-06, |
|
"loss": 1.2993, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 5.33801580333626, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 9.863347689423666e-06, |
|
"loss": 1.2649, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 5.408252853380158, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 9.854284473360694e-06, |
|
"loss": 1.2943, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 5.478489903424056, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 9.84493471508421e-06, |
|
"loss": 1.2512, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 5.548726953467955, |
|
"grad_norm": 1.375, |
|
"learning_rate": 9.835298966498511e-06, |
|
"loss": 1.3046, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 5.618964003511852, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 9.82537779638953e-06, |
|
"loss": 1.2338, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 5.689201053555751, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 9.815171790391269e-06, |
|
"loss": 1.2921, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 5.759438103599649, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 9.804681550951228e-06, |
|
"loss": 1.2306, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 5.829675153643547, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 9.793907697294844e-06, |
|
"loss": 1.2304, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 5.899912203687445, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 9.782850865388941e-06, |
|
"loss": 1.2693, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 5.970149253731344, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 9.77151170790419e-06, |
|
"loss": 1.2642, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 6.040386303775241, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 9.759890894176574e-06, |
|
"loss": 1.2702, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 6.11062335381914, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 9.747989110167887e-06, |
|
"loss": 1.2085, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 6.180860403863038, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 9.735807058425241e-06, |
|
"loss": 1.1754, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 6.251097453906936, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 9.723345458039595e-06, |
|
"loss": 1.1378, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 6.321334503950834, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 9.710605044603305e-06, |
|
"loss": 1.1588, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 6.391571553994732, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 9.697586570166707e-06, |
|
"loss": 1.1934, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 6.46180860403863, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 9.684290803193721e-06, |
|
"loss": 1.1504, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 6.532045654082529, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 9.670718528516495e-06, |
|
"loss": 1.1152, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 6.602282704126427, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 9.65687054728907e-06, |
|
"loss": 1.1628, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 6.672519754170325, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 9.642747676940094e-06, |
|
"loss": 1.1299, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 6.742756804214223, |
|
"grad_norm": 1.125, |
|
"learning_rate": 9.62835075112457e-06, |
|
"loss": 1.1635, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 6.812993854258121, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 9.61368061967464e-06, |
|
"loss": 1.1673, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 6.883230904302019, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 9.598738148549434e-06, |
|
"loss": 1.1281, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 6.953467954345918, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 9.583524219783938e-06, |
|
"loss": 1.1617, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 7.023705004389815, |
|
"grad_norm": 1.0, |
|
"learning_rate": 9.56803973143694e-06, |
|
"loss": 1.1436, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 7.093942054433714, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 9.552285597538014e-06, |
|
"loss": 1.0962, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 7.164179104477612, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 9.536262748033564e-06, |
|
"loss": 1.0893, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 7.23441615452151, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 9.519972128731937e-06, |
|
"loss": 1.0618, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 7.304653204565408, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 9.503414701247587e-06, |
|
"loss": 1.0986, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 7.374890254609307, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 9.486591442944313e-06, |
|
"loss": 1.1748, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 7.445127304653204, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 9.469503346877569e-06, |
|
"loss": 1.1092, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 7.515364354697103, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 9.452151421735846e-06, |
|
"loss": 1.1099, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 7.585601404741001, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 9.434536691781125e-06, |
|
"loss": 1.0662, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 7.655838454784899, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 9.416660196788423e-06, |
|
"loss": 1.0551, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 7.726075504828797, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 9.39852299198441e-06, |
|
"loss": 1.0643, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 7.796312554872696, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 9.380126147985122e-06, |
|
"loss": 1.0576, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 7.866549604916593, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 9.36147075073277e-06, |
|
"loss": 1.0703, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 7.936786654960492, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 9.34255790143163e-06, |
|
"loss": 1.0488, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 8.00702370500439, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 9.323388716483046e-06, |
|
"loss": 1.0843, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 8.077260755048288, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 9.303964327419524e-06, |
|
"loss": 1.0359, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 8.147497805092186, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 9.284285880837947e-06, |
|
"loss": 1.0227, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 8.217734855136085, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 9.264354538331886e-06, |
|
"loss": 0.997, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 8.287971905179983, |
|
"grad_norm": 0.875, |
|
"learning_rate": 9.244171476423037e-06, |
|
"loss": 1.0194, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 8.35820895522388, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 9.223737886491771e-06, |
|
"loss": 1.0264, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 8.428446005267778, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 9.203054974706807e-06, |
|
"loss": 1.037, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 8.498683055311677, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 9.182123961954016e-06, |
|
"loss": 1.0202, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 8.568920105355575, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 9.160946083764353e-06, |
|
"loss": 1.0577, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 8.639157155399474, |
|
"grad_norm": 0.75, |
|
"learning_rate": 9.13952259024092e-06, |
|
"loss": 1.009, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 8.709394205443372, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 9.117854745985183e-06, |
|
"loss": 0.9763, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 8.779631255487269, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 9.095943830022323e-06, |
|
"loss": 1.0406, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 8.849868305531167, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 9.073791135725722e-06, |
|
"loss": 1.0093, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 8.920105355575066, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 9.051397970740638e-06, |
|
"loss": 1.0146, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 8.990342405618964, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 9.028765656907005e-06, |
|
"loss": 1.0404, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 9.060579455662863, |
|
"grad_norm": 0.75, |
|
"learning_rate": 9.005895530181406e-06, |
|
"loss": 0.9933, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 9.130816505706761, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 8.982788940558216e-06, |
|
"loss": 0.9701, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 9.201053555750658, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 8.959447251989914e-06, |
|
"loss": 0.9554, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 9.271290605794556, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 8.935871842306569e-06, |
|
"loss": 1.0002, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 9.341527655838455, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 8.912064103134505e-06, |
|
"loss": 0.999, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 9.411764705882353, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 8.888025439814169e-06, |
|
"loss": 0.9726, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 9.482001755926252, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 8.863757271317154e-06, |
|
"loss": 1.0067, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 9.552238805970148, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 8.839261030162459e-06, |
|
"loss": 1.0181, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 9.622475856014047, |
|
"grad_norm": 0.875, |
|
"learning_rate": 8.814538162331913e-06, |
|
"loss": 0.9688, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 9.692712906057945, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 8.789590127184837e-06, |
|
"loss": 0.9309, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 9.762949956101844, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 8.764418397371888e-06, |
|
"loss": 0.987, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 9.833187006145742, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 8.739024458748128e-06, |
|
"loss": 0.9292, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 9.90342405618964, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 8.713409810285327e-06, |
|
"loss": 0.9909, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 9.973661106233537, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 8.687575963983477e-06, |
|
"loss": 0.9914, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 10.043898156277436, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 8.661524444781531e-06, |
|
"loss": 0.9369, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 10.114135206321334, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 8.635256790467402e-06, |
|
"loss": 0.9573, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 10.184372256365233, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 8.60877455158718e-06, |
|
"loss": 0.9491, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 10.254609306409131, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 8.582079291353607e-06, |
|
"loss": 0.8985, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 10.32484635645303, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 8.555172585553804e-06, |
|
"loss": 0.913, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 10.395083406496926, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 8.528056022456256e-06, |
|
"loss": 0.9727, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 10.465320456540825, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 8.500731202717056e-06, |
|
"loss": 0.9723, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 10.535557506584723, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 8.473199739285416e-06, |
|
"loss": 0.9626, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 10.605794556628622, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 8.445463257308463e-06, |
|
"loss": 0.9487, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 10.67603160667252, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 8.417523394035316e-06, |
|
"loss": 0.9832, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 10.746268656716419, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 8.389381798720417e-06, |
|
"loss": 0.9621, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 10.816505706760315, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 8.361040132526204e-06, |
|
"loss": 0.9222, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 10.886742756804214, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 8.332500068425038e-06, |
|
"loss": 0.9183, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 10.956979806848112, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 8.303763291100459e-06, |
|
"loss": 0.9173, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 11.02721685689201, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 8.274831496847735e-06, |
|
"loss": 0.9582, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 11.09745390693591, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 8.245706393473734e-06, |
|
"loss": 0.907, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 11.167690956979808, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 8.216389700196116e-06, |
|
"loss": 0.9339, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 11.237928007023704, |
|
"grad_norm": 4.375, |
|
"learning_rate": 8.186883147541846e-06, |
|
"loss": 0.9417, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 11.308165057067603, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 8.157188477245048e-06, |
|
"loss": 0.8978, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 11.378402107111501, |
|
"grad_norm": 4.625, |
|
"learning_rate": 8.12730744214419e-06, |
|
"loss": 0.9335, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 11.4486391571554, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 8.097241806078616e-06, |
|
"loss": 0.9623, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 11.518876207199298, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 8.066993343784427e-06, |
|
"loss": 0.9691, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 11.589113257243195, |
|
"grad_norm": 8.625, |
|
"learning_rate": 8.036563840789726e-06, |
|
"loss": 0.9208, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 11.659350307287093, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 8.005955093309217e-06, |
|
"loss": 0.9352, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 11.729587357330992, |
|
"grad_norm": 8.625, |
|
"learning_rate": 7.975168908138174e-06, |
|
"loss": 0.911, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 11.79982440737489, |
|
"grad_norm": 10.3125, |
|
"learning_rate": 7.944207102545795e-06, |
|
"loss": 0.9109, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 11.870061457418789, |
|
"grad_norm": 8.375, |
|
"learning_rate": 7.913071504167925e-06, |
|
"loss": 0.946, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 11.940298507462687, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 7.881763950899175e-06, |
|
"loss": 0.8947, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 12.010535557506584, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 7.850286290784437e-06, |
|
"loss": 0.8951, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 12.080772607550482, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 7.81864038190979e-06, |
|
"loss": 0.893, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 12.151009657594381, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 7.786828092292821e-06, |
|
"loss": 0.8729, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 12.22124670763828, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 7.754851299772362e-06, |
|
"loss": 0.8704, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 12.291483757682178, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 7.722711891897641e-06, |
|
"loss": 0.8639, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 12.361720807726076, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 7.690411765816864e-06, |
|
"loss": 0.8624, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 12.431957857769973, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 7.657952828165225e-06, |
|
"loss": 0.878, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 12.502194907813871, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 7.625336994952364e-06, |
|
"loss": 0.8555, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 12.57243195785777, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 7.592566191449262e-06, |
|
"loss": 0.8889, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 12.642669007901668, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 7.559642352074606e-06, |
|
"loss": 0.8598, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 12.712906057945567, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 7.526567420280585e-06, |
|
"loss": 0.8856, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 12.783143107989464, |
|
"grad_norm": 2.25, |
|
"learning_rate": 7.4933433484381905e-06, |
|
"loss": 0.8677, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 12.853380158033362, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 7.459972097721954e-06, |
|
"loss": 0.9036, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 12.92361720807726, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 7.4264556379941895e-06, |
|
"loss": 0.8607, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 12.993854258121159, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 7.392795947688715e-06, |
|
"loss": 0.8699, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 13.064091308165057, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 7.3589950136940645e-06, |
|
"loss": 0.8343, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 13.134328358208956, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 7.325054831236211e-06, |
|
"loss": 0.8305, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 13.204565408252853, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 7.2909774037607775e-06, |
|
"loss": 0.8039, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 13.274802458296751, |
|
"grad_norm": 3.0, |
|
"learning_rate": 7.256764742814796e-06, |
|
"loss": 0.8274, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 13.34503950834065, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 7.222418867927948e-06, |
|
"loss": 0.8126, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 13.415276558384548, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 7.187941806493372e-06, |
|
"loss": 0.8155, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 13.485513608428446, |
|
"grad_norm": 2.125, |
|
"learning_rate": 7.153335593647974e-06, |
|
"loss": 0.8346, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 13.555750658472345, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 7.118602272152308e-06, |
|
"loss": 0.8275, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 13.625987708516242, |
|
"grad_norm": 2.25, |
|
"learning_rate": 7.083743892269987e-06, |
|
"loss": 0.8215, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 13.69622475856014, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 7.04876251164666e-06, |
|
"loss": 0.8181, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 13.766461808604038, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 7.013660195188553e-06, |
|
"loss": 0.8099, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 13.836698858647937, |
|
"grad_norm": 2.125, |
|
"learning_rate": 6.978439014940584e-06, |
|
"loss": 0.8278, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 13.906935908691835, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 6.943101049964042e-06, |
|
"loss": 0.8299, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 13.977172958735734, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 6.907648386213875e-06, |
|
"loss": 0.8003, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 14.04741000877963, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 6.872083116415547e-06, |
|
"loss": 0.7669, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 14.04741000877963, |
|
"eval_loss": 1.4163318872451782, |
|
"eval_runtime": 28.9493, |
|
"eval_samples_per_second": 8.774, |
|
"eval_steps_per_second": 8.774, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 14.117647058823529, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 6.836407339941522e-06, |
|
"loss": 0.7701, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 14.187884108867427, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 6.800623162687325e-06, |
|
"loss": 0.7634, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 14.258121158911326, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 6.764732696947243e-06, |
|
"loss": 0.7579, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 14.328358208955224, |
|
"grad_norm": 2.5, |
|
"learning_rate": 6.728738061289634e-06, |
|
"loss": 0.7713, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 14.398595258999123, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 6.692641380431879e-06, |
|
"loss": 0.7512, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 14.46883230904302, |
|
"grad_norm": 2.125, |
|
"learning_rate": 6.6564447851149505e-06, |
|
"loss": 0.7673, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 14.539069359086918, |
|
"grad_norm": 2.0, |
|
"learning_rate": 6.620150411977648e-06, |
|
"loss": 0.747, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 14.609306409130816, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 6.5837604034304715e-06, |
|
"loss": 0.7471, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 14.679543459174715, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 6.547276907529152e-06, |
|
"loss": 0.7733, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 14.749780509218613, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 6.510702077847864e-06, |
|
"loss": 0.7356, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 14.82001755926251, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 6.474038073352098e-06, |
|
"loss": 0.7702, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 14.890254609306409, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 6.4372870582712196e-06, |
|
"loss": 0.76, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 14.960491659350307, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 6.4004512019707144e-06, |
|
"loss": 0.7664, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 15.030728709394205, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 6.363532678824145e-06, |
|
"loss": 0.7274, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 15.100965759438104, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 6.326533668084783e-06, |
|
"loss": 0.703, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 15.171202809482002, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 6.289456353756988e-06, |
|
"loss": 0.7176, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 15.241439859525899, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 6.252302924467276e-06, |
|
"loss": 0.6589, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 15.311676909569798, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 6.2150755733351305e-06, |
|
"loss": 0.6822, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 15.381913959613696, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 6.177776497843552e-06, |
|
"loss": 0.6947, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 15.452151009657594, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 6.140407899709333e-06, |
|
"loss": 0.7027, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 15.522388059701493, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 6.102971984753104e-06, |
|
"loss": 0.6625, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 15.592625109745391, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 6.065470962769119e-06, |
|
"loss": 0.7181, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 15.662862159789288, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 6.027907047394812e-06, |
|
"loss": 0.6745, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 15.733099209833187, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 5.990282455980145e-06, |
|
"loss": 0.6744, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 15.803336259877085, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 5.952599409456697e-06, |
|
"loss": 0.6877, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 15.873573309920983, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 5.914860132206584e-06, |
|
"loss": 0.7303, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 15.943810359964882, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 5.877066851931151e-06, |
|
"loss": 0.712, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 16.01404741000878, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 5.83922179951947e-06, |
|
"loss": 0.6674, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 16.08428446005268, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 5.8013272089166526e-06, |
|
"loss": 0.6726, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 16.154521510096576, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 5.763385316991995e-06, |
|
"loss": 0.6198, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 16.224758560140476, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 5.725398363406922e-06, |
|
"loss": 0.6223, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 16.294995610184372, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 5.687368590482797e-06, |
|
"loss": 0.6315, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 16.36523266022827, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 5.64929824306855e-06, |
|
"loss": 0.6418, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 16.43546971027217, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 5.611189568408173e-06, |
|
"loss": 0.6086, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 16.505706760316066, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 5.573044816008066e-06, |
|
"loss": 0.6375, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 16.575943810359966, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 5.534866237504252e-06, |
|
"loss": 0.6085, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 16.646180860403863, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 5.496656086529467e-06, |
|
"loss": 0.6355, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 16.71641791044776, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 5.458416618580126e-06, |
|
"loss": 0.6206, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 16.78665496049166, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 5.420150090883191e-06, |
|
"loss": 0.6445, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 16.856892010535557, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 5.381858762262927e-06, |
|
"loss": 0.6508, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 16.927129060579457, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 5.343544893007563e-06, |
|
"loss": 0.6198, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 16.997366110623354, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 5.305210744735874e-06, |
|
"loss": 0.6156, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 17.06760316066725, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 5.266858580263678e-06, |
|
"loss": 0.5986, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 17.13784021071115, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 5.228490663470271e-06, |
|
"loss": 0.5637, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 17.208077260755047, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 5.190109259164782e-06, |
|
"loss": 0.5738, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 17.278314310798947, |
|
"grad_norm": 1.5, |
|
"learning_rate": 5.151716632952495e-06, |
|
"loss": 0.5912, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 17.348551360842844, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 5.113315051101111e-06, |
|
"loss": 0.5782, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 17.418788410886744, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 5.074906780406962e-06, |
|
"loss": 0.5548, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 17.48902546093064, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 5.036494088061222e-06, |
|
"loss": 0.5678, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 17.559262510974538, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 4.998079241516068e-06, |
|
"loss": 0.5768, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 17.629499561018438, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 4.959664508350834e-06, |
|
"loss": 0.5794, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 17.699736611062335, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 4.921252156138163e-06, |
|
"loss": 0.5754, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 17.769973661106235, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 4.882844452310155e-06, |
|
"loss": 0.5781, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 17.84021071115013, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.844443664024517e-06, |
|
"loss": 0.5834, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 17.91044776119403, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 4.8060520580307456e-06, |
|
"loss": 0.565, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 17.98068481123793, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 4.767671900536315e-06, |
|
"loss": 0.6071, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 18.050921861281825, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.729305457072913e-06, |
|
"loss": 0.5734, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 18.121158911325725, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 4.690954992362699e-06, |
|
"loss": 0.5322, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 18.191395961369622, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 4.652622770184637e-06, |
|
"loss": 0.5304, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 18.261633011413522, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.6143110532408455e-06, |
|
"loss": 0.5368, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 18.33187006145742, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.576022103023053e-06, |
|
"loss": 0.5456, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 18.402107111501316, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.537758179679098e-06, |
|
"loss": 0.5699, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 18.472344161545216, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.499521541879508e-06, |
|
"loss": 0.5587, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 18.542581211589113, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.461314446684189e-06, |
|
"loss": 0.526, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 18.612818261633013, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.423139149409176e-06, |
|
"loss": 0.5593, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 18.68305531167691, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 4.384997903493519e-06, |
|
"loss": 0.5379, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 18.753292361720806, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.346892960366255e-06, |
|
"loss": 0.5503, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 18.823529411764707, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 4.30882656931352e-06, |
|
"loss": 0.5275, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 18.893766461808603, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.270800977345767e-06, |
|
"loss": 0.5515, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 18.964003511852503, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.232818429065128e-06, |
|
"loss": 0.5484, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 19.0342405618964, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.194881166532923e-06, |
|
"loss": 0.5451, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 19.104477611940297, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.156991429137317e-06, |
|
"loss": 0.5222, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 19.174714661984197, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 4.119151453461121e-06, |
|
"loss": 0.5167, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 19.244951712028094, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.081363473149778e-06, |
|
"loss": 0.5009, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 19.315188762071994, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.0436297187795085e-06, |
|
"loss": 0.5206, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 19.38542581211589, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 4.005952417725649e-06, |
|
"loss": 0.5241, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 19.45566286215979, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 3.968333794031165e-06, |
|
"loss": 0.5247, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 19.525899912203688, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.930776068275375e-06, |
|
"loss": 0.5138, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 19.596136962247584, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 3.89328145744287e-06, |
|
"loss": 0.5295, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 19.666374012291485, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 3.8558521747926434e-06, |
|
"loss": 0.5235, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 19.73661106233538, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 3.818490429727455e-06, |
|
"loss": 0.5198, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 19.80684811237928, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 3.7811984276634024e-06, |
|
"loss": 0.5037, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 19.877085162423178, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.743978369899748e-06, |
|
"loss": 0.5069, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 19.947322212467075, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 3.70683245348897e-06, |
|
"loss": 0.5333, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 20.017559262510975, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 3.6697628711070786e-06, |
|
"loss": 0.4939, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 20.08779631255487, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 3.632771810924184e-06, |
|
"loss": 0.4879, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 20.158033362598772, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.5958614564753313e-06, |
|
"loss": 0.518, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 20.22827041264267, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 3.559033986531608e-06, |
|
"loss": 0.4944, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 20.298507462686565, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 3.522291574971538e-06, |
|
"loss": 0.5026, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 20.368744512730466, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.4856363906527513e-06, |
|
"loss": 0.5134, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 20.438981562774362, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 3.449070597283972e-06, |
|
"loss": 0.5056, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 20.509218612818263, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 3.4125963532972878e-06, |
|
"loss": 0.5035, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 20.57945566286216, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.376215811720744e-06, |
|
"loss": 0.5134, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 20.64969271290606, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 3.3399311200512495e-06, |
|
"loss": 0.4666, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 20.719929762949956, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 3.3037444201278202e-06, |
|
"loss": 0.4965, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 20.790166812993853, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 3.267657848005139e-06, |
|
"loss": 0.4953, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 20.860403863037753, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 3.2316735338274795e-06, |
|
"loss": 0.4914, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 20.93064091308165, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.1957936017029513e-06, |
|
"loss": 0.5002, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 21.00087796312555, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 3.1600201695781335e-06, |
|
"loss": 0.5163, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 21.071115013169447, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 3.124355349113037e-06, |
|
"loss": 0.4863, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 21.141352063213343, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.0888012455564707e-06, |
|
"loss": 0.4616, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 21.211589113257244, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 3.0533599576217664e-06, |
|
"loss": 0.4897, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 21.28182616330114, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 3.0180335773628912e-06, |
|
"loss": 0.4801, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 21.35206321334504, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 2.982824190050958e-06, |
|
"loss": 0.5161, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 21.422300263388937, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 2.94773387405114e-06, |
|
"loss": 0.4854, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 21.492537313432837, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 2.912764700699978e-06, |
|
"loss": 0.5062, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 21.562774363476734, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 2.8779187341831205e-06, |
|
"loss": 0.4976, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 21.63301141352063, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.843198031413473e-06, |
|
"loss": 0.4782, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 21.70324846356453, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.808604641909781e-06, |
|
"loss": 0.4694, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 21.773485513608428, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 2.7741406076756484e-06, |
|
"loss": 0.4864, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 21.843722563652328, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 2.7398079630790064e-06, |
|
"loss": 0.4775, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 21.913959613696225, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 2.7056087347320238e-06, |
|
"loss": 0.4923, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 21.98419666374012, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 2.6715449413714778e-06, |
|
"loss": 0.4862, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 22.05443371378402, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 2.637618593739588e-06, |
|
"loss": 0.4765, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 22.12467076382792, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 2.603831694465333e-06, |
|
"loss": 0.4581, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 22.19490781387182, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 2.57018623794623e-06, |
|
"loss": 0.479, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 22.265144863915715, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 2.5366842102306144e-06, |
|
"loss": 0.4777, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 22.335381913959615, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.503327588900396e-06, |
|
"loss": 0.4865, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 22.405618964003512, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 2.4701183429543386e-06, |
|
"loss": 0.4691, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 22.47585601404741, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 2.437058432691819e-06, |
|
"loss": 0.4755, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 22.54609306409131, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 2.4041498095971253e-06, |
|
"loss": 0.46, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 22.616330114135206, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 2.3713944162242506e-06, |
|
"loss": 0.4886, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 22.686567164179106, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 2.3387941860822395e-06, |
|
"loss": 0.488, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 22.756804214223003, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 2.3063510435210456e-06, |
|
"loss": 0.4788, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 22.8270412642669, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 2.2740669036179464e-06, |
|
"loss": 0.4856, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 22.8972783143108, |
|
"grad_norm": 8.5, |
|
"learning_rate": 2.24194367206449e-06, |
|
"loss": 0.4672, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 22.967515364354696, |
|
"grad_norm": 9.875, |
|
"learning_rate": 2.209983245054014e-06, |
|
"loss": 0.4663, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 23.037752414398597, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 2.178187509169713e-06, |
|
"loss": 0.465, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 23.107989464442493, |
|
"grad_norm": 10.375, |
|
"learning_rate": 2.146558341273273e-06, |
|
"loss": 0.4592, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 23.17822651448639, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 2.115097608394084e-06, |
|
"loss": 0.4521, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 23.24846356453029, |
|
"grad_norm": 3.0, |
|
"learning_rate": 2.083807167619029e-06, |
|
"loss": 0.4396, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 23.318700614574187, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 2.0526888659828716e-06, |
|
"loss": 0.4557, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 23.388937664618087, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 2.0217445403592185e-06, |
|
"loss": 0.4686, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 23.459174714661984, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.990976017352097e-06, |
|
"loss": 0.4696, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 23.529411764705884, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.9603851131881256e-06, |
|
"loss": 0.4566, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 23.59964881474978, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.9299736336093137e-06, |
|
"loss": 0.4642, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 23.669885864793677, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.8997433737664673e-06, |
|
"loss": 0.4371, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 23.740122914837578, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.869696118113216e-06, |
|
"loss": 0.4616, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 23.810359964881474, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 1.8398336403006956e-06, |
|
"loss": 0.4551, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 23.880597014925375, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.8101577030728324e-06, |
|
"loss": 0.4456, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 23.95083406496927, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.7806700581623059e-06, |
|
"loss": 0.4756, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 24.021071115013168, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.7513724461871423e-06, |
|
"loss": 0.4519, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 24.091308165057068, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.7222665965479585e-06, |
|
"loss": 0.4489, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 24.161545215100965, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.6933542273258924e-06, |
|
"loss": 0.4369, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 24.231782265144865, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.6646370451811784e-06, |
|
"loss": 0.4246, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 24.302019315188762, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.6361167452524073e-06, |
|
"loss": 0.4181, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 24.37225636523266, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.6077950110564606e-06, |
|
"loss": 0.4349, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 24.44249341527656, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.5796735143891423e-06, |
|
"loss": 0.4358, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 24.512730465320455, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.551753915226491e-06, |
|
"loss": 0.4428, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 24.582967515364356, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.5240378616267887e-06, |
|
"loss": 0.4538, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 24.653204565408252, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.4965269896332884e-06, |
|
"loss": 0.4311, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 24.723441615452153, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.46922292317763e-06, |
|
"loss": 0.4664, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 24.79367866549605, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.4421272739839898e-06, |
|
"loss": 0.4315, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 24.863915715539946, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.4152416414739401e-06, |
|
"loss": 0.4403, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 24.934152765583846, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.3885676126720315e-06, |
|
"loss": 0.4652, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 25.004389815627743, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.362106762112123e-06, |
|
"loss": 0.4439, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 25.074626865671643, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.3358606517444328e-06, |
|
"loss": 0.443, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 25.14486391571554, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.3098308308433411e-06, |
|
"loss": 0.416, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 25.215100965759436, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.2840188359159329e-06, |
|
"loss": 0.4035, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 25.285338015803337, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.258426190611306e-06, |
|
"loss": 0.4279, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 25.355575065847233, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.2330544056306315e-06, |
|
"loss": 0.4358, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 25.425812115891134, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.2079049786379782e-06, |
|
"loss": 0.4368, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 25.49604916593503, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.1829793941719053e-06, |
|
"loss": 0.4271, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 25.56628621597893, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.1582791235578321e-06, |
|
"loss": 0.4482, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 25.636523266022827, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.1338056248211916e-06, |
|
"loss": 0.4131, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 25.706760316066724, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.1095603426013613e-06, |
|
"loss": 0.4513, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 25.776997366110624, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.0855447080663907e-06, |
|
"loss": 0.4118, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 25.84723441615452, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.0617601388285149e-06, |
|
"loss": 0.4082, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 25.91747146619842, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.0382080388604866e-06, |
|
"loss": 0.428, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 25.987708516242318, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.0148897984126876e-06, |
|
"loss": 0.452, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 26.057945566286215, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 9.918067939310766e-07, |
|
"loss": 0.4177, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 26.128182616330115, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 9.689603879759284e-07, |
|
"loss": 0.4091, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 26.19841966637401, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 9.463519291414131e-07, |
|
"loss": 0.4035, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 26.26865671641791, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 9.239827519759842e-07, |
|
"loss": 0.41, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 26.33889376646181, |
|
"grad_norm": 2.25, |
|
"learning_rate": 9.018541769036054e-07, |
|
"loss": 0.4306, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 26.409130816505705, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 8.799675101458033e-07, |
|
"loss": 0.4122, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 26.479367866549605, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 8.583240436445666e-07, |
|
"loss": 0.3986, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 26.549604916593502, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 8.369250549860869e-07, |
|
"loss": 0.437, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 26.619841966637402, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 8.157718073253351e-07, |
|
"loss": 0.4344, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 26.6900790166813, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 7.948655493115098e-07, |
|
"loss": 0.4269, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 26.7603160667252, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 7.742075150143225e-07, |
|
"loss": 0.3962, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 26.830553116769096, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 7.537989238511578e-07, |
|
"loss": 0.4376, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 26.900790166812993, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 7.336409805150901e-07, |
|
"loss": 0.4504, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 26.971027216856893, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 7.137348749037748e-07, |
|
"loss": 0.4166, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 27.04126426690079, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 6.940817820492024e-07, |
|
"loss": 0.4244, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 27.11150131694469, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 6.746828620483487e-07, |
|
"loss": 0.4096, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 27.181738366988586, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 6.555392599946903e-07, |
|
"loss": 0.3896, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 27.251975417032483, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 6.366521059106078e-07, |
|
"loss": 0.4363, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 27.322212467076383, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 6.180225146806878e-07, |
|
"loss": 0.4183, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 27.39244951712028, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 5.996515859859109e-07, |
|
"loss": 0.3911, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 27.46268656716418, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 5.815404042387379e-07, |
|
"loss": 0.4237, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 27.532923617208077, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 5.636900385191014e-07, |
|
"loss": 0.4077, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 27.603160667251977, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 5.461015425112915e-07, |
|
"loss": 0.407, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 27.673397717295874, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 5.287759544417687e-07, |
|
"loss": 0.436, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 27.74363476733977, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 5.117142970178712e-07, |
|
"loss": 0.4014, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 27.81387181738367, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 4.949175773674502e-07, |
|
"loss": 0.4272, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 27.884108867427567, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 4.783867869794157e-07, |
|
"loss": 0.4179, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 27.954345917471468, |
|
"grad_norm": 1.75, |
|
"learning_rate": 4.6212290164521554e-07, |
|
"loss": 0.4351, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 28.024582967515364, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 4.461268814012304e-07, |
|
"loss": 0.3982, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 28.09482001755926, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 4.3039967047210865e-07, |
|
"loss": 0.4249, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 28.09482001755926, |
|
"eval_loss": 1.6928666830062866, |
|
"eval_runtime": 10.5811, |
|
"eval_samples_per_second": 24.005, |
|
"eval_steps_per_second": 24.005, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 28.16505706760316, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 4.1494219721502917e-07, |
|
"loss": 0.406, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 28.235294117647058, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 3.997553740648974e-07, |
|
"loss": 0.4208, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 28.30553116769096, |
|
"grad_norm": 1.625, |
|
"learning_rate": 3.8484009748049053e-07, |
|
"loss": 0.4113, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 28.375768217734855, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 3.7019724789154e-07, |
|
"loss": 0.4131, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 28.44600526777875, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 3.558276896467555e-07, |
|
"loss": 0.4192, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 28.516242317822652, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 3.4173227096281124e-07, |
|
"loss": 0.3869, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 28.58647936786655, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 3.279118238742729e-07, |
|
"loss": 0.4353, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 28.65671641791045, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 3.143671641844831e-07, |
|
"loss": 0.3981, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 28.726953467954345, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 3.0109909141740614e-07, |
|
"loss": 0.3976, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 28.797190517998246, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 2.881083887704339e-07, |
|
"loss": 0.4075, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 28.867427568042142, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 2.753958230681547e-07, |
|
"loss": 0.4224, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 28.93766461808604, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 2.6296214471708826e-07, |
|
"loss": 0.4291, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 29.00790166812994, |
|
"grad_norm": 1.375, |
|
"learning_rate": 2.5080808766138996e-07, |
|
"loss": 0.4209, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 29.078138718173836, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 2.3893436933952575e-07, |
|
"loss": 0.426, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 29.148375768217736, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 2.2734169064192623e-07, |
|
"loss": 0.4265, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 29.218612818261633, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 2.1603073586961067e-07, |
|
"loss": 0.3948, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 29.28884986830553, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 2.0500217269379618e-07, |
|
"loss": 0.4023, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 29.35908691834943, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 1.9425665211648238e-07, |
|
"loss": 0.4033, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 29.429323968393327, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 1.837948084320268e-07, |
|
"loss": 0.4199, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 29.499561018437227, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 1.736172591897023e-07, |
|
"loss": 0.4347, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 29.569798068481123, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 1.6372460515724498e-07, |
|
"loss": 0.3949, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 29.64003511852502, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 1.541174302853876e-07, |
|
"loss": 0.4254, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 29.71027216856892, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.4479630167339554e-07, |
|
"loss": 0.3997, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 29.780509218612817, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 1.3576176953558783e-07, |
|
"loss": 0.3875, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 29.850746268656717, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.2701436716885897e-07, |
|
"loss": 0.4052, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 29.920983318700614, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 1.1855461092119991e-07, |
|
"loss": 0.4227, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 29.991220368744514, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 1.1038300016121883e-07, |
|
"loss": 0.4192, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 30.06145741878841, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 1.025000172486651e-07, |
|
"loss": 0.4067, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 30.131694468832308, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 9.490612750595096e-08, |
|
"loss": 0.4177, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 30.201931518876208, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 8.760177919069302e-08, |
|
"loss": 0.4126, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 30.272168568920105, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 8.058740346924221e-08, |
|
"loss": 0.4098, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 30.342405618964005, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 7.386341439124145e-08, |
|
"loss": 0.4183, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 30.4126426690079, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 6.74302088651796e-08, |
|
"loss": 0.4249, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 30.482879719051798, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 6.128816663496296e-08, |
|
"loss": 0.4003, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 30.5531167690957, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 5.543765025750103e-08, |
|
"loss": 0.4085, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 30.623353819139595, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 4.987900508130417e-08, |
|
"loss": 0.437, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 30.693590869183495, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 4.461255922609986e-08, |
|
"loss": 0.4117, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 30.763827919227392, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 3.963862356346049e-08, |
|
"loss": 0.4054, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 30.834064969271292, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 3.49574916984563e-08, |
|
"loss": 0.3948, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 30.90430201931519, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 3.056943995232431e-08, |
|
"loss": 0.3932, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 30.974539069359086, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 2.6474727346155194e-08, |
|
"loss": 0.426, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 31.044776119402986, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 2.2673595585605557e-08, |
|
"loss": 0.4219, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 31.115013169446883, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 1.9166269046628215e-08, |
|
"loss": 0.426, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 31.185250219490783, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 1.5952954762230575e-08, |
|
"loss": 0.4222, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 31.25548726953468, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 1.3033842410251074e-08, |
|
"loss": 0.4178, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 31.325724319578576, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 1.0409104302164241e-08, |
|
"loss": 0.4068, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 31.395961369622476, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 8.078895372908846e-09, |
|
"loss": 0.4126, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 31.466198419666373, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 6.0433531717424275e-09, |
|
"loss": 0.4136, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 31.536435469710273, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 4.302597854121127e-09, |
|
"loss": 0.4096, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 31.60667251975417, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 2.8567321746092446e-09, |
|
"loss": 0.4092, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 31.67690956979807, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 1.705841480810766e-09, |
|
"loss": 0.4072, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 31.747146619841967, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 8.499937083339404e-10, |
|
"loss": 0.4261, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 31.817383669885864, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 2.892393767800483e-10, |
|
"loss": 0.4138, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 31.887620719929764, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 2.3611586760785566e-11, |
|
"loss": 0.3897, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 31.91571553994732, |
|
"step": 4544, |
|
"total_flos": 9.547395325030564e+17, |
|
"train_loss": 0.8426483250682203, |
|
"train_runtime": 5512.317, |
|
"train_samples_per_second": 13.218, |
|
"train_steps_per_second": 0.824 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4544, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 32, |
|
"save_steps": 0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.547395325030564e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|