{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9994825355756791,
"eval_steps": -1932, |
|
"global_step": 1932, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 17.643007937527535, |
|
"learning_rate": 1.9999986779270796e-05, |
|
"loss": 1.7409, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.9197503449441455, |
|
"learning_rate": 1.9998677955919127e-05, |
|
"loss": 1.4484, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.7374808701002429, |
|
"learning_rate": 1.9994712173236604e-05, |
|
"loss": 1.3475, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.6899015443647589, |
|
"learning_rate": 1.9988103700540345e-05, |
|
"loss": 1.2921, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.7237341780111256, |
|
"learning_rate": 1.9978854285168784e-05, |
|
"loss": 1.2894, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.5922734965135668, |
|
"learning_rate": 1.99669663727489e-05, |
|
"loss": 1.268, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5581439767924481, |
|
"learning_rate": 1.9952443106549535e-05, |
|
"loss": 1.2615, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.5940665797953831, |
|
"learning_rate": 1.9935288326650314e-05, |
|
"loss": 1.2646, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5959744098368311, |
|
"learning_rate": 1.9915506568926283e-05, |
|
"loss": 1.2585, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.8144997890397803, |
|
"learning_rate": 1.989310306384858e-05, |
|
"loss": 1.2584, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.777903617079376, |
|
"learning_rate": 1.9868083735101464e-05, |
|
"loss": 1.2505, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.6440368370519068, |
|
"learning_rate": 1.9840455198016033e-05, |
|
"loss": 1.242, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.9925170475214626, |
|
"learning_rate": 1.9810224757821063e-05, |
|
"loss": 1.2588, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.8103038992058391, |
|
"learning_rate": 1.9777400407711467e-05, |
|
"loss": 1.2409, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5742872095116479, |
|
"learning_rate": 1.9741990826734793e-05, |
|
"loss": 1.2399, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5826582249622616, |
|
"learning_rate": 1.9704005377496428e-05, |
|
"loss": 1.2436, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.001834033758104, |
|
"learning_rate": 1.9663454103684043e-05, |
|
"loss": 1.2385, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.6116318375090269, |
|
"learning_rate": 1.9620347727411933e-05, |
|
"loss": 1.2297, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.7383765635051086, |
|
"learning_rate": 1.9574697646386027e-05, |
|
"loss": 1.2355, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7300866971988723, |
|
"learning_rate": 1.9526515930890203e-05, |
|
"loss": 1.2359, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.616652810626178, |
|
"learning_rate": 1.947581532059481e-05, |
|
"loss": 1.2279, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.7043435155790823, |
|
"learning_rate": 1.9422609221188208e-05, |
|
"loss": 1.2376, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.582186147373243, |
|
"learning_rate": 1.9366911700832146e-05, |
|
"loss": 1.2067, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5325118176004562, |
|
"learning_rate": 1.9308737486442045e-05, |
|
"loss": 1.2211, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.2786931118214846, |
|
"learning_rate": 1.9248101959793066e-05, |
|
"loss": 1.2308, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.8059420699070421, |
|
"learning_rate": 1.918502115345303e-05, |
|
"loss": 1.2275, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.0151484537313564, |
|
"learning_rate": 1.9119511746543265e-05, |
|
"loss": 1.2194, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6394385159074882, |
|
"learning_rate": 1.9051591060328496e-05, |
|
"loss": 1.2199, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.1135335849941765, |
|
"learning_rate": 1.8981277053636963e-05, |
|
"loss": 1.2035, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.2945894820873338, |
|
"learning_rate": 1.8908588318111932e-05, |
|
"loss": 1.2067, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.7098012376621641, |
|
"learning_rate": 1.8833544073295918e-05, |
|
"loss": 1.2095, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.053415846088335, |
|
"learning_rate": 1.8756164161548848e-05, |
|
"loss": 1.2135, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5430734273349764, |
|
"learning_rate": 1.867646904280159e-05, |
|
"loss": 1.2263, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.7828549270150109, |
|
"learning_rate": 1.859447978914614e-05, |
|
"loss": 1.2164, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5580407435562567, |
|
"learning_rate": 1.8510218079263995e-05, |
|
"loss": 1.2215, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.604581236096349, |
|
"learning_rate": 1.8423706192694118e-05, |
|
"loss": 1.2056, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6909532249212962, |
|
"learning_rate": 1.833496700394202e-05, |
|
"loss": 1.2167, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.3577994836194605, |
|
"learning_rate": 1.824402397643155e-05, |
|
"loss": 1.2158, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.9441385325830896, |
|
"learning_rate": 1.8150901156300956e-05, |
|
"loss": 1.2112, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7267919462491368, |
|
"learning_rate": 1.8055623166044855e-05, |
|
"loss": 1.2075, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 2.072921001675868, |
|
"learning_rate": 1.7958215198003866e-05, |
|
"loss": 1.2086, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.6898746674574788, |
|
"learning_rate": 1.78587030077035e-05, |
|
"loss": 1.2055, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.9590944345354587, |
|
"learning_rate": 1.77571129070442e-05, |
|
"loss": 1.2133, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.025412064694876, |
|
"learning_rate": 1.7653471757344203e-05, |
|
"loss": 1.2111, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.7709425585870252, |
|
"learning_rate": 1.7547806962237222e-05, |
|
"loss": 1.2225, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.9387702182387361, |
|
"learning_rate": 1.744014646042663e-05, |
|
"loss": 1.2118, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.3353541510148834, |
|
"learning_rate": 1.7330518718298263e-05, |
|
"loss": 1.2027, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.727639267424137, |
|
"learning_rate": 1.7218952722393646e-05, |
|
"loss": 1.2006, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.5674850292979883, |
|
"learning_rate": 1.7105477971745668e-05, |
|
"loss": 1.208, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.7282760631852704, |
|
"learning_rate": 1.699012447007882e-05, |
|
"loss": 1.1933, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.5061673744005348, |
|
"learning_rate": 1.6872922717875923e-05, |
|
"loss": 1.1954, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 2.129316814510312, |
|
"learning_rate": 1.6753903704313527e-05, |
|
"loss": 1.1948, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.562747770369411, |
|
"learning_rate": 1.6633098899068112e-05, |
|
"loss": 1.2116, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.4038904472553382, |
|
"learning_rate": 1.6510540243995216e-05, |
|
"loss": 1.2069, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.9891488942684802, |
|
"learning_rate": 1.6386260144683744e-05, |
|
"loss": 1.1841, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.674516959054479, |
|
"learning_rate": 1.6260291461887628e-05, |
|
"loss": 1.2092, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.8101178930200521, |
|
"learning_rate": 1.6132667502837164e-05, |
|
"loss": 1.1898, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.6229432949885818, |
|
"learning_rate": 1.6003422012432275e-05, |
|
"loss": 1.2033, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.7465954888391386, |
|
"learning_rate": 1.587258916432008e-05, |
|
"loss": 1.1889, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.5512571394145237, |
|
"learning_rate": 1.574020355185906e-05, |
|
"loss": 1.2108, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.6230914377601484, |
|
"learning_rate": 1.560630017897229e-05, |
|
"loss": 1.1947, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.842968187484418, |
|
"learning_rate": 1.5470914450892066e-05, |
|
"loss": 1.1959, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.5710889771223773, |
|
"learning_rate": 1.533408216479849e-05, |
|
"loss": 1.209, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.6384265788966181, |
|
"learning_rate": 1.5195839500354337e-05, |
|
"loss": 1.2034, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.1199276334985704, |
|
"learning_rate": 1.5056223010138857e-05, |
|
"loss": 1.1945, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.6533817452915996, |
|
"learning_rate": 1.491526960998295e-05, |
|
"loss": 1.2137, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.8685960014206779, |
|
"learning_rate": 1.4773016569208283e-05, |
|
"loss": 1.2111, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.5953668304650684, |
|
"learning_rate": 1.4629501500772962e-05, |
|
"loss": 1.199, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6749225480684463, |
|
"learning_rate": 1.4484762351326344e-05, |
|
"loss": 1.2084, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.904182741829073, |
|
"learning_rate": 1.4338837391175582e-05, |
|
"loss": 1.1945, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5508295579907717, |
|
"learning_rate": 1.4191765204166643e-05, |
|
"loss": 1.1939, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.6866787996093148, |
|
"learning_rate": 1.4043584677482383e-05, |
|
"loss": 1.1909, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.8835327853377292, |
|
"learning_rate": 1.3894334991360448e-05, |
|
"loss": 1.1948, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.8315517331576922, |
|
"learning_rate": 1.3744055608733654e-05, |
|
"loss": 1.1808, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.8656141990305095, |
|
"learning_rate": 1.3592786264795659e-05, |
|
"loss": 1.1887, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.0324451071242036, |
|
"learning_rate": 1.344056695649462e-05, |
|
"loss": 1.1884, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.3534158770353415, |
|
"learning_rate": 1.3287437931957642e-05, |
|
"loss": 1.1992, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.7244302821509494, |
|
"learning_rate": 1.3133439679848824e-05, |
|
"loss": 1.1873, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.7298798990304196, |
|
"learning_rate": 1.2978612918663702e-05, |
|
"loss": 1.1884, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.413179744039514, |
|
"learning_rate": 1.2822998585962909e-05, |
|
"loss": 1.1909, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.0358455016123018, |
|
"learning_rate": 1.2666637827547935e-05, |
|
"loss": 1.1987, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.6031288230702698, |
|
"learning_rate": 1.2509571986581814e-05, |
|
"loss": 1.1902, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.7093558444620996, |
|
"learning_rate": 1.2351842592657612e-05, |
|
"loss": 1.199, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.5974126496122436, |
|
"learning_rate": 1.2193491350817657e-05, |
|
"loss": 1.1928, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.0993091833167254, |
|
"learning_rate": 1.2034560130526341e-05, |
|
"loss": 1.1894, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.0436299996205698, |
|
"learning_rate": 1.1875090954599472e-05, |
|
"loss": 1.1845, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.005963688677739, |
|
"learning_rate": 1.1715125988093075e-05, |
|
"loss": 1.1881, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.8352354026650106, |
|
"learning_rate": 1.155470752715458e-05, |
|
"loss": 1.1871, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.7899077951054885, |
|
"learning_rate": 1.1393877987839329e-05, |
|
"loss": 1.1903, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.5922501845943217, |
|
"learning_rate": 1.1232679894895417e-05, |
|
"loss": 1.1826, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.048795193828536, |
|
"learning_rate": 1.1071155870519777e-05, |
|
"loss": 1.1782, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.7362075773775668, |
|
"learning_rate": 1.0909348623088472e-05, |
|
"loss": 1.2019, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.6013600464653039, |
|
"learning_rate": 1.0747300935864245e-05, |
|
"loss": 1.18, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.5456484078106916, |
|
"learning_rate": 1.058505565568424e-05, |
|
"loss": 1.1897, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.5478758461886337, |
|
"learning_rate": 1.0422655681630917e-05, |
|
"loss": 1.1842, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.589047830040976, |
|
"learning_rate": 1.0260143953689165e-05, |
|
"loss": 1.1668, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.8101184986963358, |
|
"learning_rate": 1.0097563441392582e-05, |
|
"loss": 1.1841, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.1935690641403198, |
|
"eval_runtime": 713.5459, |
|
"eval_samples_per_second": 19.182, |
|
"eval_steps_per_second": 2.398, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.7266564378737855, |
|
"learning_rate": 9.93495713246196e-06, |
|
"loss": 1.158, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.5811606746061891, |
|
"learning_rate": 9.772368021438943e-06, |
|
"loss": 1.137, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.9020508561679803, |
|
"learning_rate": 9.609839098317902e-06, |
|
"loss": 1.1431, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 1.18508535225423, |
|
"learning_rate": 9.447413337178994e-06, |
|
"loss": 1.1321, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.8495084821410501, |
|
"learning_rate": 9.285133684825435e-06, |
|
"loss": 1.1348, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.552216080899871, |
|
"learning_rate": 9.123043049427996e-06, |
|
"loss": 1.1334, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.9043855104169688, |
|
"learning_rate": 8.961184289179695e-06, |
|
"loss": 1.118, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.7622936476724878, |
|
"learning_rate": 8.799600200963716e-06, |
|
"loss": 1.1352, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.8035628121141856, |
|
"learning_rate": 8.638333509037537e-06, |
|
"loss": 1.1313, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.8605002053318102, |
|
"learning_rate": 8.477426853736257e-06, |
|
"loss": 1.122, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.8384089217871873, |
|
"learning_rate": 8.316922780198126e-06, |
|
"loss": 1.1298, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.5528220558144542, |
|
"learning_rate": 8.15686372711521e-06, |
|
"loss": 1.1235, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.6970307225205526, |
|
"learning_rate": 7.997292015512257e-06, |
|
"loss": 1.1321, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.7347197055541482, |
|
"learning_rate": 7.83824983755663e-06, |
|
"loss": 1.1294, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.5059154997147504, |
|
"learning_rate": 7.679779245402321e-06, |
|
"loss": 1.1094, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.884265369908016, |
|
"learning_rate": 7.521922140071003e-06, |
|
"loss": 1.1241, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.6033143868426079, |
|
"learning_rate": 7.364720260373017e-06, |
|
"loss": 1.1268, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 1.0148548766319452, |
|
"learning_rate": 7.208215171871277e-06, |
|
"loss": 1.1285, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.618997247973023, |
|
"learning_rate": 7.052448255890958e-06, |
|
"loss": 1.1298, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.8657773774435762, |
|
"learning_rate": 6.897460698577918e-06, |
|
"loss": 1.1207, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.7115128067386574, |
|
"learning_rate": 6.743293480008703e-06, |
|
"loss": 1.1309, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.7183269888324105, |
|
"learning_rate": 6.589987363355068e-06, |
|
"loss": 1.1218, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.5703913996302657, |
|
"learning_rate": 6.437582884105835e-06, |
|
"loss": 1.1289, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.7815346714791873, |
|
"learning_rate": 6.286120339348935e-06, |
|
"loss": 1.1388, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.5219064361079887, |
|
"learning_rate": 6.135639777116526e-06, |
|
"loss": 1.1321, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.6957441921997511, |
|
"learning_rate": 5.986180985795927e-06, |
|
"loss": 1.1344, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.5281652316634469, |
|
"learning_rate": 5.837783483609214e-06, |
|
"loss": 1.1378, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.6339946135976855, |
|
"learning_rate": 5.690486508164268e-06, |
|
"loss": 1.1234, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.6527823541816111, |
|
"learning_rate": 5.544329006079987e-06, |
|
"loss": 1.1273, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.5301541402833422, |
|
"learning_rate": 5.399349622688479e-06, |
|
"loss": 1.1135, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.5178611686388369, |
|
"learning_rate": 5.255586691816874e-06, |
|
"loss": 1.1257, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.5477723564080109, |
|
"learning_rate": 5.113078225651529e-06, |
|
"loss": 1.1267, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.5638053300059095, |
|
"learning_rate": 4.971861904687283e-06, |
|
"loss": 1.1331, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.5721584413593953, |
|
"learning_rate": 4.831975067764387e-06, |
|
"loss": 1.1107, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.9007471246680614, |
|
"learning_rate": 4.693454702195784e-06, |
|
"loss": 1.121, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.7273372669798455, |
|
"learning_rate": 4.556337433987359e-06, |
|
"loss": 1.1275, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.488613514692357, |
|
"learning_rate": 4.420659518153667e-06, |
|
"loss": 1.1274, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.578443583469973, |
|
"learning_rate": 4.286456829131821e-06, |
|
"loss": 1.124, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.6326996663378455, |
|
"learning_rate": 4.153764851295954e-06, |
|
"loss": 1.1376, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.5965643189554584, |
|
"learning_rate": 4.022618669574839e-06, |
|
"loss": 1.1219, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.5659365610817053, |
|
"learning_rate": 3.893052960175128e-06, |
|
"loss": 1.1172, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.600973182084687, |
|
"learning_rate": 3.7651019814126656e-06, |
|
"loss": 1.121, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.5982641636650878, |
|
"learning_rate": 3.6387995646542727e-06, |
|
"loss": 1.1287, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.5031147841111573, |
|
"learning_rate": 3.5141791053724405e-06, |
|
"loss": 1.1339, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.5426031516616974, |
|
"learning_rate": 3.3912735543152864e-06, |
|
"loss": 1.1288, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.5392907866642725, |
|
"learning_rate": 3.27011540879406e-06, |
|
"loss": 1.1271, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.5386770630525428, |
|
"learning_rate": 3.1507367040905943e-06, |
|
"loss": 1.1334, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.5085556091485328, |
|
"learning_rate": 3.0331690049868733e-06, |
|
"loss": 1.1297, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.5633636269166687, |
|
"learning_rate": 2.9174433974190365e-06, |
|
"loss": 1.1243, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.48764993266916146, |
|
"learning_rate": 2.803590480257985e-06, |
|
"loss": 1.1281, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.9169861978820425, |
|
"learning_rate": 2.691640357218759e-06, |
|
"loss": 1.1239, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.516479662678479, |
|
"learning_rate": 2.581622628900868e-06, |
|
"loss": 1.1228, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.6193601635552117, |
|
"learning_rate": 2.4735663849616098e-06, |
|
"loss": 1.1202, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.49498823489992116, |
|
"learning_rate": 2.367500196424529e-06, |
|
"loss": 1.1279, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.6290761419202775, |
|
"learning_rate": 2.263452108124968e-06, |
|
"loss": 1.1192, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.5701593566889765, |
|
"learning_rate": 2.161449631294785e-06, |
|
"loss": 1.1227, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.4691807510199917, |
|
"learning_rate": 2.0615197362881234e-06, |
|
"loss": 1.117, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.926057066254184, |
|
"learning_rate": 1.963688845450218e-06, |
|
"loss": 1.1177, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.7383468808391309, |
|
"learning_rate": 1.8679828261311073e-06, |
|
"loss": 1.1299, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.49356910651083413, |
|
"learning_rate": 1.774426983846058e-06, |
|
"loss": 1.1167, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.5403947993978422, |
|
"learning_rate": 1.6830460555845719e-06, |
|
"loss": 1.1324, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.5254533594635142, |
|
"learning_rate": 1.593864203269716e-06, |
|
"loss": 1.1288, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.4750626069678938, |
|
"learning_rate": 1.5069050073694813e-06, |
|
"loss": 1.127, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.4803355102824248, |
|
"learning_rate": 1.4221914606619135e-06, |
|
"loss": 1.1253, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.5023769646064918, |
|
"learning_rate": 1.339745962155613e-06, |
|
"loss": 1.1362, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.4750107387465944, |
|
"learning_rate": 1.259590311167238e-06, |
|
"loss": 1.1196, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.49120491276877826, |
|
"learning_rate": 1.181745701557574e-06, |
|
"loss": 1.1163, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.523266756507014, |
|
"learning_rate": 1.1062327161276965e-06, |
|
"loss": 1.1133, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.48201400455042975, |
|
"learning_rate": 1.0330713211766864e-06, |
|
"loss": 1.128, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.6814324481203446, |
|
"learning_rate": 9.622808612223722e-07, |
|
"loss": 1.1441, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.4529669002867237, |
|
"learning_rate": 8.9388005388647e-07, |
|
"loss": 1.132, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.5488831178907686, |
|
"learning_rate": 8.278869849454718e-07, |
|
"loss": 1.1373, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.46046446784548656, |
|
"learning_rate": 7.643191035486086e-07, |
|
"loss": 1.1147, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.48574840186097173, |
|
"learning_rate": 7.031932176041522e-07, |
|
"loss": 1.1283, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.614025528461565, |
|
"learning_rate": 6.445254893352381e-07, |
|
"loss": 1.1147, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.46356093239410984, |
|
"learning_rate": 5.883314310064492e-07, |
|
"loss": 1.1353, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.5695243032656638, |
|
"learning_rate": 5.346259008222243e-07, |
|
"loss": 1.1206, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.5116195400283284, |
|
"learning_rate": 4.834230989982214e-07, |
|
"loss": 1.1123, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.4727612452244113, |
|
"learning_rate": 4.3473656400665256e-07, |
|
"loss": 1.1258, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 1.0498570469876018, |
|
"learning_rate": 3.885791689966023e-07, |
|
"loss": 1.129, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.5474571870293417, |
|
"learning_rate": 3.4496311839024133e-07, |
|
"loss": 1.1246, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.46259219613163566, |
|
"learning_rate": 3.038999446558755e-07, |
|
"loss": 1.1249, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.5077623062630557, |
|
"learning_rate": 2.654005052586628e-07, |
|
"loss": 1.1344, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.7073531796619889, |
|
"learning_rate": 2.294749797897955e-07, |
|
"loss": 1.1075, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.48237877829197745, |
|
"learning_rate": 1.961328672749352e-07, |
|
"loss": 1.1219, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.456598123312511, |
|
"learning_rate": 1.6538298366257975e-07, |
|
"loss": 1.1109, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.4764383587717983, |
|
"learning_rate": 1.3723345949305245e-07, |
|
"loss": 1.1238, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.4611156020653585, |
|
"learning_rate": 1.1169173774871478e-07, |
|
"loss": 1.1229, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.4560163397865408, |
|
"learning_rate": 8.876457188597642e-08, |
|
"loss": 1.1325, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.5219948547290205, |
|
"learning_rate": 6.845802404962243e-08, |
|
"loss": 1.1177, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.4510998317003041, |
|
"learning_rate": 5.0777463469925406e-08, |
|
"loss": 1.125, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.48962561357722584, |
|
"learning_rate": 3.572756504297892e-08, |
|
"loss": 1.1399, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.5094047679112849, |
|
"learning_rate": 2.3312308094607382e-08, |
|
"loss": 1.1209, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.48754364841280406, |
|
"learning_rate": 1.3534975328205468e-08, |
|
"loss": 1.1311, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.48361438379969285, |
|
"learning_rate": 6.398151956754639e-09, |
|
"loss": 1.123, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.5170334599385726, |
|
"learning_rate": 1.9037250192732728e-09, |
|
"loss": 1.1228, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.8343836947757891, |
|
"learning_rate": 5.288288186688917e-11, |
|
"loss": 1.129, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.1860260963439941, |
|
"eval_runtime": 713.2766, |
|
"eval_samples_per_second": 19.189, |
|
"eval_steps_per_second": 2.399, |
|
"step": 1932 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 1932, |
|
"total_flos": 230258632556544.0, |
|
"train_loss": 1.1706056661487365, |
|
"train_runtime": 44868.6505, |
|
"train_samples_per_second": 5.513, |
|
"train_steps_per_second": 0.043 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1932, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"total_flos": 230258632556544.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|