gemma-2-2b-tulu-v2-mix / trainer_state.json
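The JSON below is the raw Hugging Face Trainer state for this run. As a minimal sketch (assuming the file has been downloaded locally as trainer_state.json), it can be loaded and summarized with Python's standard json module; the field names used are exactly those that appear in the file itself:

# Minimal sketch, not part of the file: load trainer_state.json and summarize the logged metrics.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry carries step and epoch plus either training metrics
# (loss, learning_rate, grad_norm) or evaluation metrics (eval_loss, eval_runtime, ...).
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"global_step={state['global_step']}, epochs={state['epoch']:.2f}")
print(f"train loss: first={train_logs[0]['loss']} last={train_logs[-1]['loss']}")
for e in eval_logs:
    print(f"eval at step {e['step']}: eval_loss={e['eval_loss']:.4f}")
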
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.999975471559273,
"eval_steps": 500,
"global_step": 7644,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003924550516323677,
"grad_norm": 17.851987080758406,
"learning_rate": 8.695652173913044e-07,
"loss": 1.6604,
"step": 10
},
{
"epoch": 0.007849101032647354,
"grad_norm": 4.080985759495849,
"learning_rate": 1.7391304347826088e-06,
"loss": 1.6397,
"step": 20
},
{
"epoch": 0.011773651548971031,
"grad_norm": 2.9426600727037857,
"learning_rate": 2.6086956521739132e-06,
"loss": 1.5109,
"step": 30
},
{
"epoch": 0.01569820206529471,
"grad_norm": 2.083885999160059,
"learning_rate": 3.4782608695652175e-06,
"loss": 1.4323,
"step": 40
},
{
"epoch": 0.019622752581618386,
"grad_norm": 5.06773100741999,
"learning_rate": 4.347826086956522e-06,
"loss": 1.4517,
"step": 50
},
{
"epoch": 0.023547303097942063,
"grad_norm": 3.058219108148106,
"learning_rate": 5.2173913043478265e-06,
"loss": 1.3684,
"step": 60
},
{
"epoch": 0.02747185361426574,
"grad_norm": 2.512374629159717,
"learning_rate": 6.086956521739132e-06,
"loss": 1.2584,
"step": 70
},
{
"epoch": 0.03139640413058942,
"grad_norm": 2.083781423381198,
"learning_rate": 6.956521739130435e-06,
"loss": 1.2888,
"step": 80
},
{
"epoch": 0.035320954646913094,
"grad_norm": 1.33754132363256,
"learning_rate": 7.82608695652174e-06,
"loss": 1.3061,
"step": 90
},
{
"epoch": 0.03924550516323677,
"grad_norm": 5.056402397307596,
"learning_rate": 8.695652173913044e-06,
"loss": 1.3281,
"step": 100
},
{
"epoch": 0.04317005567956045,
"grad_norm": 2.924060457678278,
"learning_rate": 9.565217391304349e-06,
"loss": 1.3235,
"step": 110
},
{
"epoch": 0.047094606195884126,
"grad_norm": 2.2704861146708177,
"learning_rate": 1.0434782608695653e-05,
"loss": 1.2073,
"step": 120
},
{
"epoch": 0.0510191567122078,
"grad_norm": 2.137074925911332,
"learning_rate": 1.1304347826086957e-05,
"loss": 1.2304,
"step": 130
},
{
"epoch": 0.05494370722853148,
"grad_norm": 1.7728719097748167,
"learning_rate": 1.2173913043478263e-05,
"loss": 1.2688,
"step": 140
},
{
"epoch": 0.05886825774485516,
"grad_norm": 4.131355974368673,
"learning_rate": 1.3043478260869566e-05,
"loss": 1.2972,
"step": 150
},
{
"epoch": 0.06279280826117883,
"grad_norm": 2.799659453356947,
"learning_rate": 1.391304347826087e-05,
"loss": 1.2953,
"step": 160
},
{
"epoch": 0.06671735877750251,
"grad_norm": 2.111913097871217,
"learning_rate": 1.4782608695652174e-05,
"loss": 1.2443,
"step": 170
},
{
"epoch": 0.07064190929382619,
"grad_norm": 2.1912441872440005,
"learning_rate": 1.565217391304348e-05,
"loss": 1.236,
"step": 180
},
{
"epoch": 0.07456645981014987,
"grad_norm": 2.234098380713297,
"learning_rate": 1.6521739130434785e-05,
"loss": 1.259,
"step": 190
},
{
"epoch": 0.07849101032647354,
"grad_norm": 4.198102404688399,
"learning_rate": 1.739130434782609e-05,
"loss": 1.2817,
"step": 200
},
{
"epoch": 0.08241556084279722,
"grad_norm": 3.019782787557433,
"learning_rate": 1.8260869565217393e-05,
"loss": 1.2664,
"step": 210
},
{
"epoch": 0.0863401113591209,
"grad_norm": 2.0685750522996167,
"learning_rate": 1.9130434782608697e-05,
"loss": 1.2373,
"step": 220
},
{
"epoch": 0.09026466187544457,
"grad_norm": 4.300018110272735,
"learning_rate": 2e-05,
"loss": 1.2275,
"step": 230
},
{
"epoch": 0.09418921239176825,
"grad_norm": 1.7354497070996773,
"learning_rate": 1.9999910223238215e-05,
"loss": 1.2408,
"step": 240
},
{
"epoch": 0.09811376290809193,
"grad_norm": 3.4204284789804036,
"learning_rate": 1.999964089456483e-05,
"loss": 1.2965,
"step": 250
},
{
"epoch": 0.1020383134244156,
"grad_norm": 2.4941748811333215,
"learning_rate": 1.9999192018815737e-05,
"loss": 1.2779,
"step": 260
},
{
"epoch": 0.10596286394073928,
"grad_norm": 1.9457868113217196,
"learning_rate": 1.999856360405066e-05,
"loss": 1.2323,
"step": 270
},
{
"epoch": 0.10988741445706296,
"grad_norm": 1.6849461858955408,
"learning_rate": 1.9997755661553007e-05,
"loss": 1.2056,
"step": 280
},
{
"epoch": 0.11381196497338664,
"grad_norm": 1.867640144655699,
"learning_rate": 1.9996768205829667e-05,
"loss": 1.2369,
"step": 290
},
{
"epoch": 0.11773651548971031,
"grad_norm": 3.3865066094085785,
"learning_rate": 1.9995601254610757e-05,
"loss": 1.2683,
"step": 300
},
{
"epoch": 0.12166106600603399,
"grad_norm": 2.397528528715023,
"learning_rate": 1.99942548288493e-05,
"loss": 1.2799,
"step": 310
},
{
"epoch": 0.12558561652235767,
"grad_norm": 2.5820404632260487,
"learning_rate": 1.9992728952720842e-05,
"loss": 1.2019,
"step": 320
},
{
"epoch": 0.12951016703868135,
"grad_norm": 1.7151124312301378,
"learning_rate": 1.9991023653623028e-05,
"loss": 1.2133,
"step": 330
},
{
"epoch": 0.13343471755500502,
"grad_norm": 2.0107220916427164,
"learning_rate": 1.9989138962175105e-05,
"loss": 1.2405,
"step": 340
},
{
"epoch": 0.1373592680713287,
"grad_norm": 2.8574288343315897,
"learning_rate": 1.998707491221737e-05,
"loss": 1.2588,
"step": 350
},
{
"epoch": 0.14128381858765238,
"grad_norm": 2.4054121950102756,
"learning_rate": 1.9984831540810567e-05,
"loss": 1.2581,
"step": 360
},
{
"epoch": 0.14520836910397605,
"grad_norm": 1.8485304425199756,
"learning_rate": 1.9982408888235224e-05,
"loss": 1.1938,
"step": 370
},
{
"epoch": 0.14913291962029973,
"grad_norm": 1.5828252143457966,
"learning_rate": 1.997980699799092e-05,
"loss": 1.1862,
"step": 380
},
{
"epoch": 0.1530574701366234,
"grad_norm": 1.2627808446236994,
"learning_rate": 1.9977025916795503e-05,
"loss": 1.2135,
"step": 390
},
{
"epoch": 0.15698202065294709,
"grad_norm": 3.2564505072901664,
"learning_rate": 1.997406569458428e-05,
"loss": 1.2506,
"step": 400
},
{
"epoch": 0.16090657116927076,
"grad_norm": 2.3862130817749825,
"learning_rate": 1.997092638450907e-05,
"loss": 1.2391,
"step": 410
},
{
"epoch": 0.16483112168559444,
"grad_norm": 1.8315079964245025,
"learning_rate": 1.9967608042937303e-05,
"loss": 1.1829,
"step": 420
},
{
"epoch": 0.16875567220191812,
"grad_norm": 1.8430052290908208,
"learning_rate": 1.9964110729450966e-05,
"loss": 1.205,
"step": 430
},
{
"epoch": 0.1726802227182418,
"grad_norm": 1.5980747844446883,
"learning_rate": 1.9960434506845555e-05,
"loss": 1.2007,
"step": 440
},
{
"epoch": 0.17660477323456547,
"grad_norm": 2.942407149873092,
"learning_rate": 1.9956579441128942e-05,
"loss": 1.2365,
"step": 450
},
{
"epoch": 0.18052932375088915,
"grad_norm": 2.379751902649777,
"learning_rate": 1.995254560152019e-05,
"loss": 1.2276,
"step": 460
},
{
"epoch": 0.18445387426721283,
"grad_norm": 1.7688369917233489,
"learning_rate": 1.9948333060448314e-05,
"loss": 1.1968,
"step": 470
},
{
"epoch": 0.1883784247835365,
"grad_norm": 1.589083721261629,
"learning_rate": 1.994394189355097e-05,
"loss": 1.195,
"step": 480
},
{
"epoch": 0.19230297529986018,
"grad_norm": 1.5496967150579632,
"learning_rate": 1.9939372179673104e-05,
"loss": 1.1999,
"step": 490
},
{
"epoch": 0.19622752581618386,
"grad_norm": 2.718076034623976,
"learning_rate": 1.9934624000865542e-05,
"loss": 1.2349,
"step": 500
},
{
"epoch": 0.20015207633250753,
"grad_norm": 2.3890997580290567,
"learning_rate": 1.9929697442383514e-05,
"loss": 1.2326,
"step": 510
},
{
"epoch": 0.2040766268488312,
"grad_norm": 1.9579908821335397,
"learning_rate": 1.9924592592685105e-05,
"loss": 1.1975,
"step": 520
},
{
"epoch": 0.2080011773651549,
"grad_norm": 1.5776501334782151,
"learning_rate": 1.991930954342969e-05,
"loss": 1.1816,
"step": 530
},
{
"epoch": 0.21192572788147856,
"grad_norm": 1.1161863427171768,
"learning_rate": 1.9913848389476283e-05,
"loss": 1.1491,
"step": 540
},
{
"epoch": 0.21585027839780224,
"grad_norm": 2.648197492260032,
"learning_rate": 1.9908209228881826e-05,
"loss": 1.2435,
"step": 550
},
{
"epoch": 0.21977482891412592,
"grad_norm": 2.408343139002311,
"learning_rate": 1.990239216289944e-05,
"loss": 1.2316,
"step": 560
},
{
"epoch": 0.2236993794304496,
"grad_norm": 1.8673008806345932,
"learning_rate": 1.9896397295976585e-05,
"loss": 1.1606,
"step": 570
},
{
"epoch": 0.22762392994677327,
"grad_norm": 1.5191789990510323,
"learning_rate": 1.9890224735753215e-05,
"loss": 1.1647,
"step": 580
},
{
"epoch": 0.23154848046309695,
"grad_norm": 1.2422189101767849,
"learning_rate": 1.9883874593059825e-05,
"loss": 1.1751,
"step": 590
},
{
"epoch": 0.23547303097942063,
"grad_norm": 2.942648489159468,
"learning_rate": 1.987734698191546e-05,
"loss": 1.2334,
"step": 600
},
{
"epoch": 0.2393975814957443,
"grad_norm": 2.4573102563716294,
"learning_rate": 1.987064201952568e-05,
"loss": 1.237,
"step": 610
},
{
"epoch": 0.24332213201206798,
"grad_norm": 1.7727471642184118,
"learning_rate": 1.9863759826280446e-05,
"loss": 1.1665,
"step": 620
},
{
"epoch": 0.24724668252839166,
"grad_norm": 1.5235132834345957,
"learning_rate": 1.9856700525751967e-05,
"loss": 1.1724,
"step": 630
},
{
"epoch": 0.25117123304471534,
"grad_norm": 1.2376130762267767,
"learning_rate": 1.9849464244692465e-05,
"loss": 1.1712,
"step": 640
},
{
"epoch": 0.255095783561039,
"grad_norm": 3.040275337730361,
"learning_rate": 1.9842051113031922e-05,
"loss": 1.2033,
"step": 650
},
{
"epoch": 0.2590203340773627,
"grad_norm": 2.3310773148731445,
"learning_rate": 1.9834461263875728e-05,
"loss": 1.2291,
"step": 660
},
{
"epoch": 0.26294488459368637,
"grad_norm": 1.7613401699558136,
"learning_rate": 1.9826694833502295e-05,
"loss": 1.2002,
"step": 670
},
{
"epoch": 0.26686943511001004,
"grad_norm": 1.4212023760208006,
"learning_rate": 1.9818751961360623e-05,
"loss": 1.1806,
"step": 680
},
{
"epoch": 0.2707939856263337,
"grad_norm": 1.5547355362596973,
"learning_rate": 1.9810632790067773e-05,
"loss": 1.203,
"step": 690
},
{
"epoch": 0.2747185361426574,
"grad_norm": 2.864900900676303,
"learning_rate": 1.9802337465406332e-05,
"loss": 1.2255,
"step": 700
},
{
"epoch": 0.2786430866589811,
"grad_norm": 2.273750000509486,
"learning_rate": 1.9793866136321775e-05,
"loss": 1.2176,
"step": 710
},
{
"epoch": 0.28256763717530475,
"grad_norm": 1.8215540424542227,
"learning_rate": 1.97852189549198e-05,
"loss": 1.159,
"step": 720
},
{
"epoch": 0.28649218769162843,
"grad_norm": 1.4139436194360697,
"learning_rate": 1.9776396076463597e-05,
"loss": 1.1552,
"step": 730
},
{
"epoch": 0.2904167382079521,
"grad_norm": 1.2805284263458057,
"learning_rate": 1.9767397659371058e-05,
"loss": 1.1857,
"step": 740
},
{
"epoch": 0.2943412887242758,
"grad_norm": 2.6772231556409025,
"learning_rate": 1.975822386521193e-05,
"loss": 1.2332,
"step": 750
},
{
"epoch": 0.29826583924059946,
"grad_norm": 2.283135985364224,
"learning_rate": 1.974887485870492e-05,
"loss": 1.203,
"step": 760
},
{
"epoch": 0.30219038975692314,
"grad_norm": 1.6219262272093848,
"learning_rate": 1.973935080771474e-05,
"loss": 1.1696,
"step": 770
},
{
"epoch": 0.3061149402732468,
"grad_norm": 1.492581338030124,
"learning_rate": 1.9729651883249075e-05,
"loss": 1.1732,
"step": 780
},
{
"epoch": 0.3100394907895705,
"grad_norm": 1.0942797906848452,
"learning_rate": 1.9719778259455533e-05,
"loss": 1.1665,
"step": 790
},
{
"epoch": 0.31396404130589417,
"grad_norm": 2.7812656388019694,
"learning_rate": 1.9709730113618507e-05,
"loss": 1.224,
"step": 800
},
{
"epoch": 0.31788859182221785,
"grad_norm": 2.2668037248648494,
"learning_rate": 1.9699507626156e-05,
"loss": 1.2128,
"step": 810
},
{
"epoch": 0.3218131423385415,
"grad_norm": 1.653446544884032,
"learning_rate": 1.9689110980616374e-05,
"loss": 1.1697,
"step": 820
},
{
"epoch": 0.3257376928548652,
"grad_norm": 1.6562721449500364,
"learning_rate": 1.967854036367506e-05,
"loss": 1.1786,
"step": 830
},
{
"epoch": 0.3296622433711889,
"grad_norm": 1.3990148712849042,
"learning_rate": 1.9667795965131215e-05,
"loss": 1.1814,
"step": 840
},
{
"epoch": 0.33358679388751256,
"grad_norm": 2.709511506933081,
"learning_rate": 1.96568779779043e-05,
"loss": 1.2135,
"step": 850
},
{
"epoch": 0.33751134440383623,
"grad_norm": 2.1617540677777303,
"learning_rate": 1.9645786598030617e-05,
"loss": 1.1925,
"step": 860
},
{
"epoch": 0.3414358949201599,
"grad_norm": 1.6647622938579412,
"learning_rate": 1.9634522024659802e-05,
"loss": 1.1654,
"step": 870
},
{
"epoch": 0.3453604454364836,
"grad_norm": 1.445420557740109,
"learning_rate": 1.9623084460051246e-05,
"loss": 1.1318,
"step": 880
},
{
"epoch": 0.34928499595280726,
"grad_norm": 1.2317074284184588,
"learning_rate": 1.9611474109570446e-05,
"loss": 1.1489,
"step": 890
},
{
"epoch": 0.35320954646913094,
"grad_norm": 2.5433820603816497,
"learning_rate": 1.9599691181685335e-05,
"loss": 1.2242,
"step": 900
},
{
"epoch": 0.3571340969854546,
"grad_norm": 2.147149465279269,
"learning_rate": 1.9587735887962533e-05,
"loss": 1.2087,
"step": 910
},
{
"epoch": 0.3610586475017783,
"grad_norm": 1.6551085595896637,
"learning_rate": 1.957560844306356e-05,
"loss": 1.1401,
"step": 920
},
{
"epoch": 0.364983198018102,
"grad_norm": 1.511777154462307,
"learning_rate": 1.9563309064740955e-05,
"loss": 1.1597,
"step": 930
},
{
"epoch": 0.36890774853442565,
"grad_norm": 1.1094232746787396,
"learning_rate": 1.955083797383439e-05,
"loss": 1.1569,
"step": 940
},
{
"epoch": 0.3728322990507493,
"grad_norm": 2.572815297222484,
"learning_rate": 1.95381953942667e-05,
"loss": 1.2189,
"step": 950
},
{
"epoch": 0.376756849567073,
"grad_norm": 2.333811966623515,
"learning_rate": 1.9525381553039852e-05,
"loss": 1.2014,
"step": 960
},
{
"epoch": 0.3806814000833967,
"grad_norm": 1.6717294454527916,
"learning_rate": 1.951239668023088e-05,
"loss": 1.1645,
"step": 970
},
{
"epoch": 0.38460595059972036,
"grad_norm": 1.5026637953972823,
"learning_rate": 1.9499241008987758e-05,
"loss": 1.1632,
"step": 980
},
{
"epoch": 0.38853050111604404,
"grad_norm": 1.2986707472970862,
"learning_rate": 1.9485914775525193e-05,
"loss": 1.1644,
"step": 990
},
{
"epoch": 0.3924550516323677,
"grad_norm": 2.9685551602402573,
"learning_rate": 1.9472418219120403e-05,
"loss": 1.1866,
"step": 1000
},
{
"epoch": 0.3963796021486914,
"grad_norm": 2.234634727478329,
"learning_rate": 1.945875158210881e-05,
"loss": 1.2102,
"step": 1010
},
{
"epoch": 0.40030415266501507,
"grad_norm": 1.724781393052345,
"learning_rate": 1.9444915109879704e-05,
"loss": 1.1415,
"step": 1020
},
{
"epoch": 0.40422870318133874,
"grad_norm": 1.4455353570141956,
"learning_rate": 1.9430909050871815e-05,
"loss": 1.1638,
"step": 1030
},
{
"epoch": 0.4081532536976624,
"grad_norm": 1.1969939749246539,
"learning_rate": 1.9416733656568868e-05,
"loss": 1.1527,
"step": 1040
},
{
"epoch": 0.4120778042139861,
"grad_norm": 2.5537155941782026,
"learning_rate": 1.9402389181495063e-05,
"loss": 1.2141,
"step": 1050
},
{
"epoch": 0.4160023547303098,
"grad_norm": 2.08351625708691,
"learning_rate": 1.9387875883210507e-05,
"loss": 1.1907,
"step": 1060
},
{
"epoch": 0.41992690524663345,
"grad_norm": 1.683716206941947,
"learning_rate": 1.937319402230658e-05,
"loss": 1.1538,
"step": 1070
},
{
"epoch": 0.42385145576295713,
"grad_norm": 1.4654454547439344,
"learning_rate": 1.935834386240127e-05,
"loss": 1.1601,
"step": 1080
},
{
"epoch": 0.4277760062792808,
"grad_norm": 1.1890264813491744,
"learning_rate": 1.934332567013443e-05,
"loss": 1.1569,
"step": 1090
},
{
"epoch": 0.4317005567956045,
"grad_norm": 2.491064388570012,
"learning_rate": 1.9328139715162994e-05,
"loss": 1.2189,
"step": 1100
},
{
"epoch": 0.43562510731192816,
"grad_norm": 2.1439066913442444,
"learning_rate": 1.9312786270156135e-05,
"loss": 1.1932,
"step": 1110
},
{
"epoch": 0.43954965782825184,
"grad_norm": 1.669653473033775,
"learning_rate": 1.9297265610790373e-05,
"loss": 1.1387,
"step": 1120
},
{
"epoch": 0.4434742083445755,
"grad_norm": 1.3843825017667384,
"learning_rate": 1.9281578015744603e-05,
"loss": 1.1376,
"step": 1130
},
{
"epoch": 0.4473987588608992,
"grad_norm": 1.1753467269379858,
"learning_rate": 1.9265723766695135e-05,
"loss": 1.1481,
"step": 1140
},
{
"epoch": 0.45132330937722287,
"grad_norm": 2.6835572127286818,
"learning_rate": 1.9249703148310588e-05,
"loss": 1.1767,
"step": 1150
},
{
"epoch": 0.45524785989354655,
"grad_norm": 2.3542813633364106,
"learning_rate": 1.9233516448246815e-05,
"loss": 1.2115,
"step": 1160
},
{
"epoch": 0.4591724104098702,
"grad_norm": 1.7788410116492113,
"learning_rate": 1.9217163957141716e-05,
"loss": 1.1595,
"step": 1170
},
{
"epoch": 0.4630969609261939,
"grad_norm": 1.589344468558335,
"learning_rate": 1.9200645968610036e-05,
"loss": 1.1469,
"step": 1180
},
{
"epoch": 0.4670215114425176,
"grad_norm": 1.3723348105531945,
"learning_rate": 1.918396277923807e-05,
"loss": 1.1257,
"step": 1190
},
{
"epoch": 0.47094606195884126,
"grad_norm": 2.3205173632902674,
"learning_rate": 1.9167114688578368e-05,
"loss": 1.1712,
"step": 1200
},
{
"epoch": 0.47487061247516493,
"grad_norm": 2.20379995717616,
"learning_rate": 1.9150101999144338e-05,
"loss": 1.198,
"step": 1210
},
{
"epoch": 0.4787951629914886,
"grad_norm": 1.6571237734382616,
"learning_rate": 1.9132925016404805e-05,
"loss": 1.1346,
"step": 1220
},
{
"epoch": 0.4827197135078123,
"grad_norm": 1.5174786415787016,
"learning_rate": 1.911558404877855e-05,
"loss": 1.1382,
"step": 1230
},
{
"epoch": 0.48664426402413596,
"grad_norm": 1.0610840707954994,
"learning_rate": 1.909807940762876e-05,
"loss": 1.1223,
"step": 1240
},
{
"epoch": 0.49056881454045964,
"grad_norm": 2.628300567508133,
"learning_rate": 1.908041140725743e-05,
"loss": 1.1758,
"step": 1250
},
{
"epoch": 0.4944933650567833,
"grad_norm": 2.1593460670180655,
"learning_rate": 1.9062580364899735e-05,
"loss": 1.2182,
"step": 1260
},
{
"epoch": 0.498417915573107,
"grad_norm": 1.6850637902364638,
"learning_rate": 1.9044586600718323e-05,
"loss": 1.1582,
"step": 1270
},
{
"epoch": 0.5023424660894307,
"grad_norm": 1.6079103318853967,
"learning_rate": 1.9026430437797568e-05,
"loss": 1.1213,
"step": 1280
},
{
"epoch": 0.5062670166057543,
"grad_norm": 1.2518743683340756,
"learning_rate": 1.9008112202137777e-05,
"loss": 1.1546,
"step": 1290
},
{
"epoch": 0.510191567122078,
"grad_norm": 2.4116262224448057,
"learning_rate": 1.898963222264932e-05,
"loss": 1.1807,
"step": 1300
},
{
"epoch": 0.5141161176384017,
"grad_norm": 2.301017308903272,
"learning_rate": 1.8970990831146744e-05,
"loss": 1.1837,
"step": 1310
},
{
"epoch": 0.5180406681547254,
"grad_norm": 1.69892059072323,
"learning_rate": 1.8952188362342804e-05,
"loss": 1.1347,
"step": 1320
},
{
"epoch": 0.5219652186710491,
"grad_norm": 1.399901320658771,
"learning_rate": 1.8933225153842446e-05,
"loss": 1.1464,
"step": 1330
},
{
"epoch": 0.5258897691873727,
"grad_norm": 1.0557641784260816,
"learning_rate": 1.8914101546136766e-05,
"loss": 1.1349,
"step": 1340
},
{
"epoch": 0.5298143197036964,
"grad_norm": 2.5362209062888295,
"learning_rate": 1.889481788259688e-05,
"loss": 1.1834,
"step": 1350
},
{
"epoch": 0.5337388702200201,
"grad_norm": 2.1100281299468278,
"learning_rate": 1.8875374509467757e-05,
"loss": 1.2085,
"step": 1360
},
{
"epoch": 0.5376634207363438,
"grad_norm": 1.656879350518556,
"learning_rate": 1.8855771775862014e-05,
"loss": 1.145,
"step": 1370
},
{
"epoch": 0.5415879712526674,
"grad_norm": 1.3835758185671234,
"learning_rate": 1.8836010033753637e-05,
"loss": 1.1332,
"step": 1380
},
{
"epoch": 0.5455125217689911,
"grad_norm": 1.3074526959475135,
"learning_rate": 1.8816089637971674e-05,
"loss": 1.1337,
"step": 1390
},
{
"epoch": 0.5494370722853148,
"grad_norm": 2.471754592942074,
"learning_rate": 1.879601094619385e-05,
"loss": 1.177,
"step": 1400
},
{
"epoch": 0.5533616228016385,
"grad_norm": 2.0297870153949953,
"learning_rate": 1.877577431894015e-05,
"loss": 1.2251,
"step": 1410
},
{
"epoch": 0.5572861733179622,
"grad_norm": 1.6193945226941358,
"learning_rate": 1.8755380119566343e-05,
"loss": 1.0928,
"step": 1420
},
{
"epoch": 0.5612107238342858,
"grad_norm": 1.4634472002355838,
"learning_rate": 1.873482871425747e-05,
"loss": 1.143,
"step": 1430
},
{
"epoch": 0.5651352743506095,
"grad_norm": 1.2878805152891477,
"learning_rate": 1.8714120472021252e-05,
"loss": 1.1712,
"step": 1440
},
{
"epoch": 0.5690598248669332,
"grad_norm": 2.6107789403264965,
"learning_rate": 1.8693255764681476e-05,
"loss": 1.1793,
"step": 1450
},
{
"epoch": 0.5729843753832569,
"grad_norm": 2.101138870961313,
"learning_rate": 1.867223496687131e-05,
"loss": 1.1724,
"step": 1460
},
{
"epoch": 0.5769089258995805,
"grad_norm": 1.559869838184,
"learning_rate": 1.865105845602659e-05,
"loss": 1.1569,
"step": 1470
},
{
"epoch": 0.5808334764159042,
"grad_norm": 1.4484698696907943,
"learning_rate": 1.8629726612379034e-05,
"loss": 1.1461,
"step": 1480
},
{
"epoch": 0.5847580269322279,
"grad_norm": 1.1551387246395677,
"learning_rate": 1.86082398189494e-05,
"loss": 1.1276,
"step": 1490
},
{
"epoch": 0.5886825774485516,
"grad_norm": 2.3243966610365208,
"learning_rate": 1.8586598461540647e-05,
"loss": 1.1865,
"step": 1500
},
{
"epoch": 0.5926071279648752,
"grad_norm": 2.045560797585921,
"learning_rate": 1.8564802928730963e-05,
"loss": 1.1981,
"step": 1510
},
{
"epoch": 0.5965316784811989,
"grad_norm": 1.5774145920172018,
"learning_rate": 1.8542853611866826e-05,
"loss": 1.1475,
"step": 1520
},
{
"epoch": 0.6004562289975226,
"grad_norm": 1.401143146614057,
"learning_rate": 1.8520750905055948e-05,
"loss": 1.1113,
"step": 1530
},
{
"epoch": 0.6043807795138463,
"grad_norm": 1.0993576375496286,
"learning_rate": 1.849849520516023e-05,
"loss": 1.1196,
"step": 1540
},
{
"epoch": 0.60830533003017,
"grad_norm": 2.6837789697900694,
"learning_rate": 1.8476086911788588e-05,
"loss": 1.1731,
"step": 1550
},
{
"epoch": 0.6122298805464936,
"grad_norm": 2.2269178857118166,
"learning_rate": 1.8453526427289836e-05,
"loss": 1.1673,
"step": 1560
},
{
"epoch": 0.6161544310628173,
"grad_norm": 1.6753938116918217,
"learning_rate": 1.8430814156745424e-05,
"loss": 1.1212,
"step": 1570
},
{
"epoch": 0.620078981579141,
"grad_norm": 1.2926994229597162,
"learning_rate": 1.8407950507962166e-05,
"loss": 1.12,
"step": 1580
},
{
"epoch": 0.6240035320954647,
"grad_norm": 1.1434774002781025,
"learning_rate": 1.8384935891464938e-05,
"loss": 1.1059,
"step": 1590
},
{
"epoch": 0.6279280826117883,
"grad_norm": 2.5701272662383623,
"learning_rate": 1.8361770720489287e-05,
"loss": 1.1667,
"step": 1600
},
{
"epoch": 0.631852633128112,
"grad_norm": 2.099590587250419,
"learning_rate": 1.8338455410974017e-05,
"loss": 1.1811,
"step": 1610
},
{
"epoch": 0.6357771836444357,
"grad_norm": 1.5881595852499024,
"learning_rate": 1.831499038155373e-05,
"loss": 1.1198,
"step": 1620
},
{
"epoch": 0.6397017341607594,
"grad_norm": 1.364474265009956,
"learning_rate": 1.8291376053551293e-05,
"loss": 1.1348,
"step": 1630
},
{
"epoch": 0.643626284677083,
"grad_norm": 1.0132232392193459,
"learning_rate": 1.8267612850970292e-05,
"loss": 1.1341,
"step": 1640
},
{
"epoch": 0.6475508351934067,
"grad_norm": 2.375671216469934,
"learning_rate": 1.824370120048739e-05,
"loss": 1.1971,
"step": 1650
},
{
"epoch": 0.6514753857097304,
"grad_norm": 2.0072312065319142,
"learning_rate": 1.8219641531444713e-05,
"loss": 1.1696,
"step": 1660
},
{
"epoch": 0.6553999362260541,
"grad_norm": 1.621521304969733,
"learning_rate": 1.8195434275842088e-05,
"loss": 1.1116,
"step": 1670
},
{
"epoch": 0.6593244867423778,
"grad_norm": 1.289974630938439,
"learning_rate": 1.817107986832932e-05,
"loss": 1.1427,
"step": 1680
},
{
"epoch": 0.6632490372587014,
"grad_norm": 1.2226882453760828,
"learning_rate": 1.8146578746198374e-05,
"loss": 1.1324,
"step": 1690
},
{
"epoch": 0.6671735877750251,
"grad_norm": 2.6497361969234836,
"learning_rate": 1.812193134937554e-05,
"loss": 1.1518,
"step": 1700
},
{
"epoch": 0.6710981382913488,
"grad_norm": 1.934779659069536,
"learning_rate": 1.8097138120413503e-05,
"loss": 1.1667,
"step": 1710
},
{
"epoch": 0.6750226888076725,
"grad_norm": 1.682274234607041,
"learning_rate": 1.8072199504483428e-05,
"loss": 1.1094,
"step": 1720
},
{
"epoch": 0.6789472393239961,
"grad_norm": 1.3189385241228773,
"learning_rate": 1.8047115949366955e-05,
"loss": 1.1485,
"step": 1730
},
{
"epoch": 0.6828717898403198,
"grad_norm": 1.2380330513347648,
"learning_rate": 1.8021887905448146e-05,
"loss": 1.1228,
"step": 1740
},
{
"epoch": 0.6867963403566435,
"grad_norm": 2.503615625647334,
"learning_rate": 1.799651582570543e-05,
"loss": 1.1545,
"step": 1750
},
{
"epoch": 0.6907208908729672,
"grad_norm": 1.9970401432155471,
"learning_rate": 1.7971000165703434e-05,
"loss": 1.1698,
"step": 1760
},
{
"epoch": 0.6946454413892909,
"grad_norm": 1.585682831800493,
"learning_rate": 1.7945341383584818e-05,
"loss": 1.12,
"step": 1770
},
{
"epoch": 0.6985699919056145,
"grad_norm": 1.4103033699462193,
"learning_rate": 1.7919539940062068e-05,
"loss": 1.1375,
"step": 1780
},
{
"epoch": 0.7024945424219382,
"grad_norm": 1.1741968139844532,
"learning_rate": 1.7893596298409182e-05,
"loss": 1.1045,
"step": 1790
},
{
"epoch": 0.7064190929382619,
"grad_norm": 2.5435233808457265,
"learning_rate": 1.7867510924453394e-05,
"loss": 1.1561,
"step": 1800
},
{
"epoch": 0.7103436434545856,
"grad_norm": 2.0058708638995744,
"learning_rate": 1.784128428656678e-05,
"loss": 1.1905,
"step": 1810
},
{
"epoch": 0.7142681939709092,
"grad_norm": 1.5513764227014477,
"learning_rate": 1.7814916855657872e-05,
"loss": 1.116,
"step": 1820
},
{
"epoch": 0.7181927444872329,
"grad_norm": 1.3841452634663314,
"learning_rate": 1.7788409105163178e-05,
"loss": 1.1359,
"step": 1830
},
{
"epoch": 0.7221172950035566,
"grad_norm": 1.0616071904873385,
"learning_rate": 1.7761761511038694e-05,
"loss": 1.0973,
"step": 1840
},
{
"epoch": 0.7260418455198803,
"grad_norm": 2.4472932789694726,
"learning_rate": 1.773497455175137e-05,
"loss": 1.1611,
"step": 1850
},
{
"epoch": 0.729966396036204,
"grad_norm": 2.1478318961127325,
"learning_rate": 1.7708048708270497e-05,
"loss": 1.1637,
"step": 1860
},
{
"epoch": 0.7338909465525276,
"grad_norm": 1.5986355699917554,
"learning_rate": 1.7680984464059077e-05,
"loss": 1.1179,
"step": 1870
},
{
"epoch": 0.7378154970688513,
"grad_norm": 1.2845396362764854,
"learning_rate": 1.7653782305065158e-05,
"loss": 1.1407,
"step": 1880
},
{
"epoch": 0.741740047585175,
"grad_norm": 1.1067586294132603,
"learning_rate": 1.7626442719713083e-05,
"loss": 1.1255,
"step": 1890
},
{
"epoch": 0.7456645981014987,
"grad_norm": 2.2098441511746705,
"learning_rate": 1.7598966198894746e-05,
"loss": 1.1756,
"step": 1900
},
{
"epoch": 0.7495891486178223,
"grad_norm": 1.9535681972911683,
"learning_rate": 1.7571353235960754e-05,
"loss": 1.1813,
"step": 1910
},
{
"epoch": 0.753513699134146,
"grad_norm": 1.5198953597892402,
"learning_rate": 1.7543604326711592e-05,
"loss": 1.1157,
"step": 1920
},
{
"epoch": 0.7574382496504697,
"grad_norm": 1.282264664888777,
"learning_rate": 1.7515719969388697e-05,
"loss": 1.1325,
"step": 1930
},
{
"epoch": 0.7613628001667934,
"grad_norm": 1.1529117257538906,
"learning_rate": 1.7487700664665536e-05,
"loss": 1.1579,
"step": 1940
},
{
"epoch": 0.765287350683117,
"grad_norm": 2.4479153069810877,
"learning_rate": 1.7459546915638595e-05,
"loss": 1.1548,
"step": 1950
},
{
"epoch": 0.7692119011994407,
"grad_norm": 1.9930358021042167,
"learning_rate": 1.743125922781836e-05,
"loss": 1.1702,
"step": 1960
},
{
"epoch": 0.7731364517157644,
"grad_norm": 1.5561233301314203,
"learning_rate": 1.740283810912023e-05,
"loss": 1.1098,
"step": 1970
},
{
"epoch": 0.7770610022320881,
"grad_norm": 1.363877169879588,
"learning_rate": 1.737428406985541e-05,
"loss": 1.1276,
"step": 1980
},
{
"epoch": 0.7809855527484117,
"grad_norm": 1.1118223017408846,
"learning_rate": 1.7345597622721727e-05,
"loss": 1.1143,
"step": 1990
},
{
"epoch": 0.7849101032647354,
"grad_norm": 2.2103639445489707,
"learning_rate": 1.7316779282794458e-05,
"loss": 1.1436,
"step": 2000
},
{
"epoch": 0.7888346537810591,
"grad_norm": 1.9866216610680039,
"learning_rate": 1.728782956751705e-05,
"loss": 1.1366,
"step": 2010
},
{
"epoch": 0.7927592042973828,
"grad_norm": 1.557276150793299,
"learning_rate": 1.725874899669183e-05,
"loss": 1.1028,
"step": 2020
},
{
"epoch": 0.7966837548137065,
"grad_norm": 1.3407353791664398,
"learning_rate": 1.7229538092470708e-05,
"loss": 1.121,
"step": 2030
},
{
"epoch": 0.8006083053300301,
"grad_norm": 1.0452399263705143,
"learning_rate": 1.7200197379345752e-05,
"loss": 1.1052,
"step": 2040
},
{
"epoch": 0.8045328558463538,
"grad_norm": 2.255129030276742,
"learning_rate": 1.7170727384139808e-05,
"loss": 1.1534,
"step": 2050
},
{
"epoch": 0.8084574063626775,
"grad_norm": 2.023922420597953,
"learning_rate": 1.7141128635997027e-05,
"loss": 1.1536,
"step": 2060
},
{
"epoch": 0.8123819568790012,
"grad_norm": 1.6459699923272906,
"learning_rate": 1.711140166637336e-05,
"loss": 1.1237,
"step": 2070
},
{
"epoch": 0.8163065073953248,
"grad_norm": 1.3995835514158206,
"learning_rate": 1.7081547009027014e-05,
"loss": 1.1364,
"step": 2080
},
{
"epoch": 0.8202310579116485,
"grad_norm": 1.1468866803278337,
"learning_rate": 1.705156520000889e-05,
"loss": 1.1055,
"step": 2090
},
{
"epoch": 0.8241556084279722,
"grad_norm": 2.4534252010588626,
"learning_rate": 1.702145677765293e-05,
"loss": 1.1233,
"step": 2100
},
{
"epoch": 0.8280801589442959,
"grad_norm": 2.069228109517972,
"learning_rate": 1.6991222282566465e-05,
"loss": 1.1368,
"step": 2110
},
{
"epoch": 0.8320047094606196,
"grad_norm": 1.5353360725588796,
"learning_rate": 1.696086225762051e-05,
"loss": 1.0936,
"step": 2120
},
{
"epoch": 0.8359292599769432,
"grad_norm": 1.31441567730565,
"learning_rate": 1.6930377247940005e-05,
"loss": 1.103,
"step": 2130
},
{
"epoch": 0.8398538104932669,
"grad_norm": 0.9169234319686077,
"learning_rate": 1.689976780089405e-05,
"loss": 1.0933,
"step": 2140
},
{
"epoch": 0.8437783610095906,
"grad_norm": 2.321894947275435,
"learning_rate": 1.6869034466086046e-05,
"loss": 1.1397,
"step": 2150
},
{
"epoch": 0.8477029115259143,
"grad_norm": 2.0305512220158386,
"learning_rate": 1.6838177795343847e-05,
"loss": 1.1704,
"step": 2160
},
{
"epoch": 0.8516274620422379,
"grad_norm": 1.5512604200141975,
"learning_rate": 1.6807198342709858e-05,
"loss": 1.1113,
"step": 2170
},
{
"epoch": 0.8555520125585616,
"grad_norm": 1.3624069523785742,
"learning_rate": 1.677609666443105e-05,
"loss": 1.1355,
"step": 2180
},
{
"epoch": 0.8594765630748853,
"grad_norm": 1.162420454215084,
"learning_rate": 1.6744873318949032e-05,
"loss": 1.1217,
"step": 2190
},
{
"epoch": 0.863401113591209,
"grad_norm": 2.59296418698937,
"learning_rate": 1.6713528866889966e-05,
"loss": 1.1753,
"step": 2200
},
{
"epoch": 0.8673256641075326,
"grad_norm": 2.005406504075907,
"learning_rate": 1.6682063871054534e-05,
"loss": 1.1596,
"step": 2210
},
{
"epoch": 0.8712502146238563,
"grad_norm": 1.5322297264828286,
"learning_rate": 1.6650478896407825e-05,
"loss": 1.1093,
"step": 2220
},
{
"epoch": 0.87517476514018,
"grad_norm": 1.3164373320619593,
"learning_rate": 1.6618774510069187e-05,
"loss": 1.109,
"step": 2230
},
{
"epoch": 0.8790993156565037,
"grad_norm": 0.979869187444681,
"learning_rate": 1.6586951281302046e-05,
"loss": 1.1212,
"step": 2240
},
{
"epoch": 0.8830238661728274,
"grad_norm": 2.4909588716430617,
"learning_rate": 1.655500978150369e-05,
"loss": 1.1286,
"step": 2250
},
{
"epoch": 0.886948416689151,
"grad_norm": 2.022377055680929,
"learning_rate": 1.6522950584195003e-05,
"loss": 1.149,
"step": 2260
},
{
"epoch": 0.8908729672054747,
"grad_norm": 1.522868856655693,
"learning_rate": 1.649077426501017e-05,
"loss": 1.0971,
"step": 2270
},
{
"epoch": 0.8947975177217984,
"grad_norm": 1.3697259871875316,
"learning_rate": 1.6458481401686334e-05,
"loss": 1.0805,
"step": 2280
},
{
"epoch": 0.8987220682381221,
"grad_norm": 1.018922203182855,
"learning_rate": 1.6426072574053238e-05,
"loss": 1.1299,
"step": 2290
},
{
"epoch": 0.9026466187544457,
"grad_norm": 2.375176209683022,
"learning_rate": 1.6393548364022803e-05,
"loss": 1.1244,
"step": 2300
},
{
"epoch": 0.9065711692707694,
"grad_norm": 1.9892422302580408,
"learning_rate": 1.636090935557868e-05,
"loss": 1.1825,
"step": 2310
},
{
"epoch": 0.9104957197870931,
"grad_norm": 1.5724093274080302,
"learning_rate": 1.632815613476576e-05,
"loss": 1.1016,
"step": 2320
},
{
"epoch": 0.9144202703034168,
"grad_norm": 1.2589150941461418,
"learning_rate": 1.6295289289679674e-05,
"loss": 1.1056,
"step": 2330
},
{
"epoch": 0.9183448208197404,
"grad_norm": 0.946685256837567,
"learning_rate": 1.62623094104562e-05,
"loss": 1.0829,
"step": 2340
},
{
"epoch": 0.9222693713360641,
"grad_norm": 2.352958803717975,
"learning_rate": 1.6229217089260695e-05,
"loss": 1.1514,
"step": 2350
},
{
"epoch": 0.9261939218523878,
"grad_norm": 1.9500631427530646,
"learning_rate": 1.6196012920277436e-05,
"loss": 1.1563,
"step": 2360
},
{
"epoch": 0.9301184723687115,
"grad_norm": 1.5860068083635046,
"learning_rate": 1.616269749969899e-05,
"loss": 1.0999,
"step": 2370
},
{
"epoch": 0.9340430228850352,
"grad_norm": 1.2393963203713174,
"learning_rate": 1.6129271425715458e-05,
"loss": 1.1056,
"step": 2380
},
{
"epoch": 0.9379675734013588,
"grad_norm": 1.0108366563366444,
"learning_rate": 1.609573529850379e-05,
"loss": 1.0886,
"step": 2390
},
{
"epoch": 0.9418921239176825,
"grad_norm": 2.307306825085365,
"learning_rate": 1.6062089720216956e-05,
"loss": 1.125,
"step": 2400
},
{
"epoch": 0.9458166744340062,
"grad_norm": 2.08922761238031,
"learning_rate": 1.6028335294973182e-05,
"loss": 1.1676,
"step": 2410
},
{
"epoch": 0.9497412249503299,
"grad_norm": 1.5072970519469342,
"learning_rate": 1.5994472628845054e-05,
"loss": 1.0805,
"step": 2420
},
{
"epoch": 0.9536657754666535,
"grad_norm": 1.208673658456983,
"learning_rate": 1.5960502329848683e-05,
"loss": 1.1023,
"step": 2430
},
{
"epoch": 0.9575903259829772,
"grad_norm": 1.0701544169565054,
"learning_rate": 1.5926425007932747e-05,
"loss": 1.0802,
"step": 2440
},
{
"epoch": 0.9615148764993009,
"grad_norm": 2.2259127710545745,
"learning_rate": 1.5892241274967578e-05,
"loss": 1.1306,
"step": 2450
},
{
"epoch": 0.9654394270156246,
"grad_norm": 1.9613490972509378,
"learning_rate": 1.5857951744734145e-05,
"loss": 1.1527,
"step": 2460
},
{
"epoch": 0.9693639775319483,
"grad_norm": 1.6339352423393527,
"learning_rate": 1.5823557032913045e-05,
"loss": 1.1173,
"step": 2470
},
{
"epoch": 0.9732885280482719,
"grad_norm": 1.2845543184685153,
"learning_rate": 1.5789057757073444e-05,
"loss": 1.0858,
"step": 2480
},
{
"epoch": 0.9772130785645956,
"grad_norm": 1.0634701364462926,
"learning_rate": 1.5754454536662e-05,
"loss": 1.0772,
"step": 2490
},
{
"epoch": 0.9811376290809193,
"grad_norm": 2.5457603854360618,
"learning_rate": 1.5719747992991723e-05,
"loss": 1.1572,
"step": 2500
},
{
"epoch": 0.985062179597243,
"grad_norm": 1.9734906496943037,
"learning_rate": 1.568493874923084e-05,
"loss": 1.1277,
"step": 2510
},
{
"epoch": 0.9889867301135666,
"grad_norm": 1.5783237318057073,
"learning_rate": 1.5650027430391584e-05,
"loss": 1.0856,
"step": 2520
},
{
"epoch": 0.9929112806298903,
"grad_norm": 1.4264738203605272,
"learning_rate": 1.5615014663318993e-05,
"loss": 1.1078,
"step": 2530
},
{
"epoch": 0.996835831146214,
"grad_norm": 1.1620086010183999,
"learning_rate": 1.5579901076679625e-05,
"loss": 1.1097,
"step": 2540
},
{
"epoch": 0.999975471559273,
"eval_loss": 0.9125259518623352,
"eval_runtime": 1520.6591,
"eval_samples_per_second": 16.44,
"eval_steps_per_second": 4.11,
"step": 2548
},
{
"epoch": 1.0007849101032646,
"grad_norm": 1.0252573802297382,
"learning_rate": 1.5544687300950306e-05,
"loss": 0.9338,
"step": 2550
},
{
"epoch": 1.0047094606195883,
"grad_norm": 2.201134957778082,
"learning_rate": 1.5509373968406792e-05,
"loss": 0.9016,
"step": 2560
},
{
"epoch": 1.008634011135912,
"grad_norm": 1.7915224952217608,
"learning_rate": 1.5473961713112405e-05,
"loss": 0.991,
"step": 2570
},
{
"epoch": 1.0125585616522357,
"grad_norm": 1.7104141742083112,
"learning_rate": 1.5438451170906672e-05,
"loss": 0.9134,
"step": 2580
},
{
"epoch": 1.0164831121685594,
"grad_norm": 1.488033671149185,
"learning_rate": 1.5402842979393882e-05,
"loss": 0.8688,
"step": 2590
},
{
"epoch": 1.020407662684883,
"grad_norm": 1.1133098743882353,
"learning_rate": 1.5367137777931673e-05,
"loss": 0.8432,
"step": 2600
},
{
"epoch": 1.0243322132012067,
"grad_norm": 2.0585351825127143,
"learning_rate": 1.5331336207619507e-05,
"loss": 0.8874,
"step": 2610
},
{
"epoch": 1.0282567637175304,
"grad_norm": 1.867901103345647,
"learning_rate": 1.5295438911287203e-05,
"loss": 0.9336,
"step": 2620
},
{
"epoch": 1.032181314233854,
"grad_norm": 1.6563402725505043,
"learning_rate": 1.5259446533483357e-05,
"loss": 0.8879,
"step": 2630
},
{
"epoch": 1.0361058647501777,
"grad_norm": 1.4856230129943386,
"learning_rate": 1.5223359720463796e-05,
"loss": 0.859,
"step": 2640
},
{
"epoch": 1.0400304152665014,
"grad_norm": 0.9711013069770768,
"learning_rate": 1.5187179120179969e-05,
"loss": 0.8288,
"step": 2650
},
{
"epoch": 1.043954965782825,
"grad_norm": 1.9828694802282283,
"learning_rate": 1.5150905382267299e-05,
"loss": 0.8955,
"step": 2660
},
{
"epoch": 1.0478795162991488,
"grad_norm": 2.0020449073070283,
"learning_rate": 1.511453915803353e-05,
"loss": 0.9694,
"step": 2670
},
{
"epoch": 1.0518040668154724,
"grad_norm": 1.6466177405453537,
"learning_rate": 1.5078081100447035e-05,
"loss": 0.9115,
"step": 2680
},
{
"epoch": 1.0557286173317961,
"grad_norm": 1.7037544484554887,
"learning_rate": 1.5041531864125082e-05,
"loss": 0.8493,
"step": 2690
},
{
"epoch": 1.0596531678481198,
"grad_norm": 0.9649533896621292,
"learning_rate": 1.5004892105322092e-05,
"loss": 0.8204,
"step": 2700
},
{
"epoch": 1.0635777183644435,
"grad_norm": 2.104909863577972,
"learning_rate": 1.4968162481917836e-05,
"loss": 0.9002,
"step": 2710
},
{
"epoch": 1.0675022688807672,
"grad_norm": 2.0341252133554146,
"learning_rate": 1.4931343653405652e-05,
"loss": 0.9456,
"step": 2720
},
{
"epoch": 1.0714268193970908,
"grad_norm": 1.5915526204280668,
"learning_rate": 1.4894436280880578e-05,
"loss": 0.8801,
"step": 2730
},
{
"epoch": 1.0753513699134145,
"grad_norm": 1.5184799515062875,
"learning_rate": 1.4857441027027486e-05,
"loss": 0.8608,
"step": 2740
},
{
"epoch": 1.0792759204297382,
"grad_norm": 1.0739540966168113,
"learning_rate": 1.4820358556109202e-05,
"loss": 0.8383,
"step": 2750
},
{
"epoch": 1.0832004709460619,
"grad_norm": 2.3527635362598787,
"learning_rate": 1.4783189533954555e-05,
"loss": 0.8989,
"step": 2760
},
{
"epoch": 1.0871250214623855,
"grad_norm": 1.8879516919488128,
"learning_rate": 1.4745934627946432e-05,
"loss": 0.9203,
"step": 2770
},
{
"epoch": 1.0910495719787092,
"grad_norm": 1.6794025897082823,
"learning_rate": 1.4708594507009806e-05,
"loss": 0.8939,
"step": 2780
},
{
"epoch": 1.094974122495033,
"grad_norm": 1.263872598631979,
"learning_rate": 1.4671169841599695e-05,
"loss": 0.8435,
"step": 2790
},
{
"epoch": 1.0988986730113566,
"grad_norm": 0.9984868253029158,
"learning_rate": 1.4633661303689157e-05,
"loss": 0.8568,
"step": 2800
},
{
"epoch": 1.1028232235276803,
"grad_norm": 2.060249688323629,
"learning_rate": 1.4596069566757207e-05,
"loss": 0.8882,
"step": 2810
},
{
"epoch": 1.106747774044004,
"grad_norm": 1.920918977602427,
"learning_rate": 1.4558395305776731e-05,
"loss": 0.9299,
"step": 2820
},
{
"epoch": 1.1106723245603276,
"grad_norm": 1.6211510195551506,
"learning_rate": 1.4520639197202355e-05,
"loss": 0.8898,
"step": 2830
},
{
"epoch": 1.1145968750766513,
"grad_norm": 1.229775633396699,
"learning_rate": 1.4482801918958312e-05,
"loss": 0.8609,
"step": 2840
},
{
"epoch": 1.118521425592975,
"grad_norm": 1.1229141852110762,
"learning_rate": 1.4444884150426267e-05,
"loss": 0.8456,
"step": 2850
},
{
"epoch": 1.1224459761092986,
"grad_norm": 2.120911251605639,
"learning_rate": 1.4406886572433113e-05,
"loss": 0.8918,
"step": 2860
},
{
"epoch": 1.1263705266256223,
"grad_norm": 2.0769262910789767,
"learning_rate": 1.4368809867238754e-05,
"loss": 0.9531,
"step": 2870
},
{
"epoch": 1.130295077141946,
"grad_norm": 1.6225603088474205,
"learning_rate": 1.4330654718523847e-05,
"loss": 0.8979,
"step": 2880
},
{
"epoch": 1.1342196276582697,
"grad_norm": 1.1891920263768887,
"learning_rate": 1.4292421811377532e-05,
"loss": 0.8697,
"step": 2890
},
{
"epoch": 1.1381441781745933,
"grad_norm": 1.0159234075717296,
"learning_rate": 1.4254111832285128e-05,
"loss": 0.8353,
"step": 2900
},
{
"epoch": 1.142068728690917,
"grad_norm": 2.0861820849148383,
"learning_rate": 1.4215725469115806e-05,
"loss": 0.8676,
"step": 2910
},
{
"epoch": 1.1459932792072407,
"grad_norm": 1.9828582401916874,
"learning_rate": 1.4177263411110249e-05,
"loss": 0.9457,
"step": 2920
},
{
"epoch": 1.1499178297235644,
"grad_norm": 1.6363978446581915,
"learning_rate": 1.413872634886825e-05,
"loss": 0.8682,
"step": 2930
},
{
"epoch": 1.153842380239888,
"grad_norm": 1.414190445025758,
"learning_rate": 1.4100114974336352e-05,
"loss": 0.8663,
"step": 2940
},
{
"epoch": 1.1577669307562117,
"grad_norm": 0.9877954021628765,
"learning_rate": 1.4061429980795382e-05,
"loss": 0.8439,
"step": 2950
},
{
"epoch": 1.1616914812725354,
"grad_norm": 2.2358539386859726,
"learning_rate": 1.4022672062848034e-05,
"loss": 0.8819,
"step": 2960
},
{
"epoch": 1.165616031788859,
"grad_norm": 1.8768050513117698,
"learning_rate": 1.3983841916406383e-05,
"loss": 0.9261,
"step": 2970
},
{
"epoch": 1.1695405823051828,
"grad_norm": 1.6017361554290517,
"learning_rate": 1.3944940238679384e-05,
"loss": 0.898,
"step": 2980
},
{
"epoch": 1.1734651328215064,
"grad_norm": 1.2526189890877126,
"learning_rate": 1.390596772816037e-05,
"loss": 0.8496,
"step": 2990
},
{
"epoch": 1.1773896833378301,
"grad_norm": 1.0365383728001132,
"learning_rate": 1.3866925084614501e-05,
"loss": 0.8468,
"step": 3000
},
{
"epoch": 1.1813142338541538,
"grad_norm": 2.224248057811622,
"learning_rate": 1.3827813009066202e-05,
"loss": 0.8759,
"step": 3010
},
{
"epoch": 1.1852387843704775,
"grad_norm": 1.8752890091264613,
"learning_rate": 1.3788632203786567e-05,
"loss": 0.9297,
"step": 3020
},
{
"epoch": 1.1891633348868011,
"grad_norm": 1.6600876268313813,
"learning_rate": 1.374938337228076e-05,
"loss": 0.87,
"step": 3030
},
{
"epoch": 1.1930878854031248,
"grad_norm": 1.5276898634308227,
"learning_rate": 1.3710067219275382e-05,
"loss": 0.8693,
"step": 3040
},
{
"epoch": 1.1970124359194485,
"grad_norm": 0.9561545850483195,
"learning_rate": 1.3670684450705813e-05,
"loss": 0.8369,
"step": 3050
},
{
"epoch": 1.2009369864357722,
"grad_norm": 2.3827528867634307,
"learning_rate": 1.3631235773703535e-05,
"loss": 0.8932,
"step": 3060
},
{
"epoch": 1.2048615369520959,
"grad_norm": 2.0991202817563828,
"learning_rate": 1.3591721896583455e-05,
"loss": 0.9404,
"step": 3070
},
{
"epoch": 1.2087860874684195,
"grad_norm": 1.6251446131482838,
"learning_rate": 1.3552143528831149e-05,
"loss": 0.8804,
"step": 3080
},
{
"epoch": 1.2127106379847432,
"grad_norm": 1.3451655629852488,
"learning_rate": 1.3512501381090158e-05,
"loss": 0.8529,
"step": 3090
},
{
"epoch": 1.2166351885010669,
"grad_norm": 0.8385047694025927,
"learning_rate": 1.3472796165149217e-05,
"loss": 0.818,
"step": 3100
},
{
"epoch": 1.2205597390173906,
"grad_norm": 2.267246006997812,
"learning_rate": 1.3433028593929467e-05,
"loss": 0.8779,
"step": 3110
},
{
"epoch": 1.2244842895337142,
"grad_norm": 2.0703651009730035,
"learning_rate": 1.3393199381471657e-05,
"loss": 0.9371,
"step": 3120
},
{
"epoch": 1.228408840050038,
"grad_norm": 1.6728269310666124,
"learning_rate": 1.3353309242923336e-05,
"loss": 0.862,
"step": 3130
},
{
"epoch": 1.2323333905663616,
"grad_norm": 1.3943213550602511,
"learning_rate": 1.3313358894525997e-05,
"loss": 0.8734,
"step": 3140
},
{
"epoch": 1.2362579410826853,
"grad_norm": 0.9417267831082166,
"learning_rate": 1.327334905360222e-05,
"loss": 0.819,
"step": 3150
},
{
"epoch": 1.240182491599009,
"grad_norm": 2.03618285439807,
"learning_rate": 1.3233280438542795e-05,
"loss": 0.8671,
"step": 3160
},
{
"epoch": 1.2441070421153326,
"grad_norm": 1.8648736136084698,
"learning_rate": 1.319315376879383e-05,
"loss": 0.945,
"step": 3170
},
{
"epoch": 1.2480315926316563,
"grad_norm": 1.5679297693032792,
"learning_rate": 1.3152969764843812e-05,
"loss": 0.8778,
"step": 3180
},
{
"epoch": 1.25195614314798,
"grad_norm": 1.375990035837938,
"learning_rate": 1.3112729148210694e-05,
"loss": 0.8501,
"step": 3190
},
{
"epoch": 1.2558806936643037,
"grad_norm": 0.8945069929861201,
"learning_rate": 1.3072432641428931e-05,
"loss": 0.8555,
"step": 3200
},
{
"epoch": 1.2598052441806273,
"grad_norm": 1.9973442354473332,
"learning_rate": 1.3032080968036498e-05,
"loss": 0.8922,
"step": 3210
},
{
"epoch": 1.263729794696951,
"grad_norm": 2.1062119639954444,
"learning_rate": 1.2991674852561904e-05,
"loss": 0.929,
"step": 3220
},
{
"epoch": 1.2676543452132747,
"grad_norm": 1.6116944823889532,
"learning_rate": 1.2951215020511196e-05,
"loss": 0.8672,
"step": 3230
},
{
"epoch": 1.2715788957295984,
"grad_norm": 1.330101731052534,
"learning_rate": 1.2910702198354915e-05,
"loss": 0.853,
"step": 3240
},
{
"epoch": 1.275503446245922,
"grad_norm": 1.0644575098102675,
"learning_rate": 1.2870137113515053e-05,
"loss": 0.8281,
"step": 3250
},
{
"epoch": 1.2794279967622457,
"grad_norm": 2.4521439306633748,
"learning_rate": 1.2829520494352004e-05,
"loss": 0.8696,
"step": 3260
},
{
"epoch": 1.2833525472785694,
"grad_norm": 2.0163802307404466,
"learning_rate": 1.2788853070151477e-05,
"loss": 0.9172,
"step": 3270
},
{
"epoch": 1.287277097794893,
"grad_norm": 1.6281065777711845,
"learning_rate": 1.2748135571111404e-05,
"loss": 0.8644,
"step": 3280
},
{
"epoch": 1.2912016483112168,
"grad_norm": 1.5009876264848987,
"learning_rate": 1.2707368728328826e-05,
"loss": 0.8654,
"step": 3290
},
{
"epoch": 1.2951261988275404,
"grad_norm": 0.7853723853615091,
"learning_rate": 1.2666553273786771e-05,
"loss": 0.8031,
"step": 3300
},
{
"epoch": 1.299050749343864,
"grad_norm": 2.189696126410242,
"learning_rate": 1.2625689940341102e-05,
"loss": 0.8795,
"step": 3310
},
{
"epoch": 1.3029752998601878,
"grad_norm": 1.9334298716685878,
"learning_rate": 1.2584779461707374e-05,
"loss": 0.926,
"step": 3320
},
{
"epoch": 1.3068998503765115,
"grad_norm": 1.5481872721801182,
"learning_rate": 1.254382257244765e-05,
"loss": 0.8565,
"step": 3330
},
{
"epoch": 1.3108244008928351,
"grad_norm": 1.2607713095759323,
"learning_rate": 1.2502820007957302e-05,
"loss": 0.8505,
"step": 3340
},
{
"epoch": 1.3147489514091588,
"grad_norm": 0.8930555325270236,
"learning_rate": 1.2461772504451822e-05,
"loss": 0.8323,
"step": 3350
},
{
"epoch": 1.3186735019254825,
"grad_norm": 1.9338117374514918,
"learning_rate": 1.2420680798953604e-05,
"loss": 0.8754,
"step": 3360
},
{
"epoch": 1.3225980524418062,
"grad_norm": 2.061421402861005,
"learning_rate": 1.2379545629278693e-05,
"loss": 0.9426,
"step": 3370
},
{
"epoch": 1.3265226029581298,
"grad_norm": 1.588986245387613,
"learning_rate": 1.233836773402356e-05,
"loss": 0.867,
"step": 3380
},
{
"epoch": 1.3304471534744535,
"grad_norm": 1.2886545052157778,
"learning_rate": 1.229714785255182e-05,
"loss": 0.868,
"step": 3390
},
{
"epoch": 1.3343717039907772,
"grad_norm": 1.219703452074695,
"learning_rate": 1.2255886724980974e-05,
"loss": 0.8168,
"step": 3400
},
{
"epoch": 1.3382962545071009,
"grad_norm": 2.236888625402866,
"learning_rate": 1.2214585092169103e-05,
"loss": 0.8794,
"step": 3410
},
{
"epoch": 1.3422208050234246,
"grad_norm": 2.13039765109174,
"learning_rate": 1.2173243695701575e-05,
"loss": 0.9218,
"step": 3420
},
{
"epoch": 1.3461453555397482,
"grad_norm": 1.652775626275893,
"learning_rate": 1.213186327787773e-05,
"loss": 0.8937,
"step": 3430
},
{
"epoch": 1.350069906056072,
"grad_norm": 1.469984852272106,
"learning_rate": 1.209044458169756e-05,
"loss": 0.8582,
"step": 3440
},
{
"epoch": 1.3539944565723956,
"grad_norm": 0.8843940147907386,
"learning_rate": 1.2048988350848338e-05,
"loss": 0.8537,
"step": 3450
},
{
"epoch": 1.3579190070887193,
"grad_norm": 2.286808087504479,
"learning_rate": 1.2007495329691301e-05,
"loss": 0.8915,
"step": 3460
},
{
"epoch": 1.361843557605043,
"grad_norm": 2.1637511011528585,
"learning_rate": 1.1965966263248267e-05,
"loss": 0.9545,
"step": 3470
},
{
"epoch": 1.3657681081213666,
"grad_norm": 1.615967978820237,
"learning_rate": 1.192440189718825e-05,
"loss": 0.889,
"step": 3480
},
{
"epoch": 1.3696926586376903,
"grad_norm": 1.3561598955089822,
"learning_rate": 1.1882802977814092e-05,
"loss": 0.8568,
"step": 3490
},
{
"epoch": 1.373617209154014,
"grad_norm": 1.031049725329889,
"learning_rate": 1.184117025204905e-05,
"loss": 0.8065,
"step": 3500
},
{
"epoch": 1.3775417596703377,
"grad_norm": 2.130942563304923,
"learning_rate": 1.1799504467423382e-05,
"loss": 0.8781,
"step": 3510
},
{
"epoch": 1.3814663101866613,
"grad_norm": 1.914562387879776,
"learning_rate": 1.1757806372060934e-05,
"loss": 0.9244,
"step": 3520
},
{
"epoch": 1.385390860702985,
"grad_norm": 1.6890782058344322,
"learning_rate": 1.1716076714665701e-05,
"loss": 0.8621,
"step": 3530
},
{
"epoch": 1.3893154112193087,
"grad_norm": 1.2006002638188844,
"learning_rate": 1.1674316244508381e-05,
"loss": 0.8393,
"step": 3540
},
{
"epoch": 1.3932399617356324,
"grad_norm": 1.168530668598062,
"learning_rate": 1.1632525711412936e-05,
"loss": 0.8402,
"step": 3550
},
{
"epoch": 1.397164512251956,
"grad_norm": 2.214073914981144,
"learning_rate": 1.1590705865743108e-05,
"loss": 0.8794,
"step": 3560
},
{
"epoch": 1.4010890627682797,
"grad_norm": 1.9175558977832043,
"learning_rate": 1.1548857458388967e-05,
"loss": 0.9226,
"step": 3570
},
{
"epoch": 1.4050136132846034,
"grad_norm": 1.7516445773007234,
"learning_rate": 1.1506981240753406e-05,
"loss": 0.8688,
"step": 3580
},
{
"epoch": 1.408938163800927,
"grad_norm": 1.236950976064499,
"learning_rate": 1.1465077964738674e-05,
"loss": 0.8604,
"step": 3590
},
{
"epoch": 1.4128627143172507,
"grad_norm": 0.9946584575727137,
"learning_rate": 1.1423148382732854e-05,
"loss": 0.8198,
"step": 3600
},
{
"epoch": 1.4167872648335744,
"grad_norm": 2.357981326812003,
"learning_rate": 1.1381193247596365e-05,
"loss": 0.8815,
"step": 3610
},
{
"epoch": 1.420711815349898,
"grad_norm": 2.068839516965018,
"learning_rate": 1.133921331264844e-05,
"loss": 0.9218,
"step": 3620
},
{
"epoch": 1.4246363658662218,
"grad_norm": 1.7728344994302516,
"learning_rate": 1.1297209331653606e-05,
"loss": 0.8679,
"step": 3630
},
{
"epoch": 1.4285609163825455,
"grad_norm": 1.2048121431631056,
"learning_rate": 1.1255182058808143e-05,
"loss": 0.8544,
"step": 3640
},
{
"epoch": 1.4324854668988691,
"grad_norm": 1.333275738578586,
"learning_rate": 1.1213132248726541e-05,
"loss": 0.8409,
"step": 3650
},
{
"epoch": 1.4364100174151928,
"grad_norm": 2.206900104214621,
"learning_rate": 1.1171060656427957e-05,
"loss": 0.881,
"step": 3660
},
{
"epoch": 1.4403345679315165,
"grad_norm": 2.0032350194617923,
"learning_rate": 1.1128968037322654e-05,
"loss": 0.9304,
"step": 3670
},
{
"epoch": 1.4442591184478402,
"grad_norm": 1.899875816582961,
"learning_rate": 1.1086855147198442e-05,
"loss": 0.8756,
"step": 3680
},
{
"epoch": 1.4481836689641638,
"grad_norm": 1.1750450632734881,
"learning_rate": 1.1044722742207102e-05,
"loss": 0.8516,
"step": 3690
},
{
"epoch": 1.4521082194804875,
"grad_norm": 1.0654206477147208,
"learning_rate": 1.1002571578850808e-05,
"loss": 0.8287,
"step": 3700
},
{
"epoch": 1.4560327699968112,
"grad_norm": 2.1156979827088698,
"learning_rate": 1.0960402413968552e-05,
"loss": 0.8747,
"step": 3710
},
{
"epoch": 1.4599573205131349,
"grad_norm": 1.9613993823327838,
"learning_rate": 1.0918216004722551e-05,
"loss": 0.9248,
"step": 3720
},
{
"epoch": 1.4638818710294585,
"grad_norm": 1.5896298448753268,
"learning_rate": 1.0876013108584644e-05,
"loss": 0.862,
"step": 3730
},
{
"epoch": 1.4678064215457822,
"grad_norm": 1.2663265403929282,
"learning_rate": 1.08337944833227e-05,
"loss": 0.8671,
"step": 3740
},
{
"epoch": 1.471730972062106,
"grad_norm": 0.8402022117082641,
"learning_rate": 1.0791560886987016e-05,
"loss": 0.8089,
"step": 3750
},
{
"epoch": 1.4756555225784296,
"grad_norm": 2.2895246842794346,
"learning_rate": 1.0749313077896697e-05,
"loss": 0.8865,
"step": 3760
},
{
"epoch": 1.4795800730947533,
"grad_norm": 2.005690344314872,
"learning_rate": 1.0707051814626035e-05,
"loss": 0.9195,
"step": 3770
},
{
"epoch": 1.483504623611077,
"grad_norm": 1.6528392426188365,
"learning_rate": 1.0664777855990909e-05,
"loss": 0.8482,
"step": 3780
},
{
"epoch": 1.4874291741274006,
"grad_norm": 1.3252339862500955,
"learning_rate": 1.062249196103514e-05,
"loss": 0.8633,
"step": 3790
},
{
"epoch": 1.4913537246437243,
"grad_norm": 0.8384617708635065,
"learning_rate": 1.0580194889016866e-05,
"loss": 0.8424,
"step": 3800
},
{
"epoch": 1.495278275160048,
"grad_norm": 2.2376729601071013,
"learning_rate": 1.0537887399394926e-05,
"loss": 0.8698,
"step": 3810
},
{
"epoch": 1.4992028256763716,
"grad_norm": 2.0054017581982158,
"learning_rate": 1.0495570251815204e-05,
"loss": 0.9146,
"step": 3820
},
{
"epoch": 1.5031273761926953,
"grad_norm": 1.620117177752491,
"learning_rate": 1.0453244206096993e-05,
"loss": 0.86,
"step": 3830
},
{
"epoch": 1.507051926709019,
"grad_norm": 1.2414222987018593,
"learning_rate": 1.0410910022219356e-05,
"loss": 0.8462,
"step": 3840
},
{
"epoch": 1.5109764772253427,
"grad_norm": 0.985434847255323,
"learning_rate": 1.0368568460307482e-05,
"loss": 0.8374,
"step": 3850
},
{
"epoch": 1.5149010277416664,
"grad_norm": 2.414664203183688,
"learning_rate": 1.0326220280619036e-05,
"loss": 0.8643,
"step": 3860
},
{
"epoch": 1.51882557825799,
"grad_norm": 1.959661011938626,
"learning_rate": 1.0283866243530506e-05,
"loss": 0.9216,
"step": 3870
},
{
"epoch": 1.5227501287743137,
"grad_norm": 1.8216460410768873,
"learning_rate": 1.0241507109523551e-05,
"loss": 0.8557,
"step": 3880
},
{
"epoch": 1.5266746792906374,
"grad_norm": 1.2379402019260293,
"learning_rate": 1.019914363917135e-05,
"loss": 0.8528,
"step": 3890
},
{
"epoch": 1.530599229806961,
"grad_norm": 1.1791841499933475,
"learning_rate": 1.0156776593124933e-05,
"loss": 0.8409,
"step": 3900
},
{
"epoch": 1.5345237803232847,
"grad_norm": 2.2744852259625294,
"learning_rate": 1.0114406732099549e-05,
"loss": 0.877,
"step": 3910
},
{
"epoch": 1.5384483308396084,
"grad_norm": 1.9131218031571517,
"learning_rate": 1.0072034816860979e-05,
"loss": 0.9287,
"step": 3920
},
{
"epoch": 1.542372881355932,
"grad_norm": 1.7840885206492576,
"learning_rate": 1.0029661608211884e-05,
"loss": 0.8511,
"step": 3930
},
{
"epoch": 1.5462974318722558,
"grad_norm": 1.227268938632673,
"learning_rate": 9.987287866978169e-06,
"loss": 0.8535,
"step": 3940
},
{
"epoch": 1.5502219823885794,
"grad_norm": 1.1621128799647606,
"learning_rate": 9.944914353995277e-06,
"loss": 0.8447,
"step": 3950
},
{
"epoch": 1.5541465329049031,
"grad_norm": 2.2074040426072483,
"learning_rate": 9.90254183009457e-06,
"loss": 0.8529,
"step": 3960
},
{
"epoch": 1.5580710834212268,
"grad_norm": 1.8956680374875223,
"learning_rate": 9.860171056089646e-06,
"loss": 0.9103,
"step": 3970
},
{
"epoch": 1.5619956339375505,
"grad_norm": 1.8417783539473633,
"learning_rate": 9.817802792762675e-06,
"loss": 0.8619,
"step": 3980
},
{
"epoch": 1.5659201844538742,
"grad_norm": 1.1239027585518444,
"learning_rate": 9.775437800850764e-06,
"loss": 0.8405,
"step": 3990
},
{
"epoch": 1.5698447349701978,
"grad_norm": 1.093899462684469,
"learning_rate": 9.73307684103226e-06,
"loss": 0.8409,
"step": 4000
},
{
"epoch": 1.5737692854865215,
"grad_norm": 2.2458835890160698,
"learning_rate": 9.690720673913135e-06,
"loss": 0.8331,
"step": 4010
},
{
"epoch": 1.5776938360028452,
"grad_norm": 1.9788980494060489,
"learning_rate": 9.648370060013279e-06,
"loss": 0.9097,
"step": 4020
},
{
"epoch": 1.5816183865191689,
"grad_norm": 1.627824694857222,
"learning_rate": 9.606025759752895e-06,
"loss": 0.8831,
"step": 4030
},
{
"epoch": 1.5855429370354925,
"grad_norm": 1.2178062700728904,
"learning_rate": 9.56368853343882e-06,
"loss": 0.8462,
"step": 4040
},
{
"epoch": 1.5894674875518162,
"grad_norm": 0.8688900774056482,
"learning_rate": 9.52135914125086e-06,
"loss": 0.8132,
"step": 4050
},
{
"epoch": 1.59339203806814,
"grad_norm": 2.223343280091266,
"learning_rate": 9.479038343228173e-06,
"loss": 0.8987,
"step": 4060
},
{
"epoch": 1.5973165885844636,
"grad_norm": 1.9082308916293935,
"learning_rate": 9.436726899255596e-06,
"loss": 0.9305,
"step": 4070
},
{
"epoch": 1.6012411391007872,
"grad_norm": 1.6281376379032095,
"learning_rate": 9.394425569050018e-06,
"loss": 0.8806,
"step": 4080
},
{
"epoch": 1.605165689617111,
"grad_norm": 1.3465856506420029,
"learning_rate": 9.352135112146726e-06,
"loss": 0.8553,
"step": 4090
},
{
"epoch": 1.6090902401334346,
"grad_norm": 0.8680202506295652,
"learning_rate": 9.309856287885775e-06,
"loss": 0.8224,
"step": 4100
},
{
"epoch": 1.6130147906497583,
"grad_norm": 2.0551986939006266,
"learning_rate": 9.267589855398356e-06,
"loss": 0.866,
"step": 4110
},
{
"epoch": 1.616939341166082,
"grad_norm": 2.009568214502001,
"learning_rate": 9.22533657359315e-06,
"loss": 0.9291,
"step": 4120
},
{
"epoch": 1.6208638916824056,
"grad_norm": 1.63900979800811,
"learning_rate": 9.183097201142722e-06,
"loss": 0.8596,
"step": 4130
},
{
"epoch": 1.6247884421987293,
"grad_norm": 1.270695193750196,
"learning_rate": 9.140872496469891e-06,
"loss": 0.8496,
"step": 4140
},
{
"epoch": 1.628712992715053,
"grad_norm": 0.800400571374905,
"learning_rate": 9.098663217734102e-06,
"loss": 0.8171,
"step": 4150
},
{
"epoch": 1.6326375432313767,
"grad_norm": 2.239858328605921,
"learning_rate": 9.056470122817836e-06,
"loss": 0.8696,
"step": 4160
},
{
"epoch": 1.6365620937477003,
"grad_norm": 2.0001741452838444,
"learning_rate": 9.01429396931297e-06,
"loss": 0.9174,
"step": 4170
},
{
"epoch": 1.640486644264024,
"grad_norm": 1.6873988098563508,
"learning_rate": 8.972135514507212e-06,
"loss": 0.8725,
"step": 4180
},
{
"epoch": 1.6444111947803477,
"grad_norm": 1.1871894670080978,
"learning_rate": 8.92999551537046e-06,
"loss": 0.819,
"step": 4190
},
{
"epoch": 1.6483357452966714,
"grad_norm": 0.8255765266436247,
"learning_rate": 8.88787472854126e-06,
"loss": 0.8178,
"step": 4200
},
{
"epoch": 1.652260295812995,
"grad_norm": 2.1797960381842927,
"learning_rate": 8.845773910313168e-06,
"loss": 0.8486,
"step": 4210
},
{
"epoch": 1.6561848463293187,
"grad_norm": 1.9916857527591452,
"learning_rate": 8.803693816621218e-06,
"loss": 0.8947,
"step": 4220
},
{
"epoch": 1.6601093968456424,
"grad_norm": 1.7685848462081732,
"learning_rate": 8.761635203028319e-06,
"loss": 0.8766,
"step": 4230
},
{
"epoch": 1.664033947361966,
"grad_norm": 1.245584977919593,
"learning_rate": 8.719598824711694e-06,
"loss": 0.8337,
"step": 4240
},
{
"epoch": 1.6679584978782898,
"grad_norm": 1.0048598136091462,
"learning_rate": 8.677585436449332e-06,
"loss": 0.8163,
"step": 4250
},
{
"epoch": 1.6718830483946134,
"grad_norm": 2.398297940398292,
"learning_rate": 8.635595792606419e-06,
"loss": 0.8559,
"step": 4260
},
{
"epoch": 1.6758075989109371,
"grad_norm": 1.9730845143153721,
"learning_rate": 8.593630647121809e-06,
"loss": 0.895,
"step": 4270
},
{
"epoch": 1.6797321494272608,
"grad_norm": 1.6696729781305142,
"learning_rate": 8.551690753494476e-06,
"loss": 0.8508,
"step": 4280
},
{
"epoch": 1.6836566999435845,
"grad_norm": 1.3530608242464415,
"learning_rate": 8.509776864769982e-06,
"loss": 0.8295,
"step": 4290
},
{
"epoch": 1.6875812504599081,
"grad_norm": 0.8499210708063567,
"learning_rate": 8.467889733526977e-06,
"loss": 0.8245,
"step": 4300
},
{
"epoch": 1.6915058009762318,
"grad_norm": 2.3773533821815067,
"learning_rate": 8.426030111863654e-06,
"loss": 0.8521,
"step": 4310
},
{
"epoch": 1.6954303514925555,
"grad_norm": 2.077394013322105,
"learning_rate": 8.384198751384272e-06,
"loss": 0.9227,
"step": 4320
},
{
"epoch": 1.6993549020088792,
"grad_norm": 1.6718741898436833,
"learning_rate": 8.342396403185649e-06,
"loss": 0.8448,
"step": 4330
},
{
"epoch": 1.7032794525252029,
"grad_norm": 1.3584330125549005,
"learning_rate": 8.300623817843673e-06,
"loss": 0.8385,
"step": 4340
},
{
"epoch": 1.7072040030415265,
"grad_norm": 0.9817563909977678,
"learning_rate": 8.258881745399837e-06,
"loss": 0.8062,
"step": 4350
},
{
"epoch": 1.7111285535578502,
"grad_norm": 2.3439029636827127,
"learning_rate": 8.217170935347756e-06,
"loss": 0.8164,
"step": 4360
},
{
"epoch": 1.7150531040741739,
"grad_norm": 2.016656786265483,
"learning_rate": 8.17549213661973e-06,
"loss": 0.8954,
"step": 4370
},
{
"epoch": 1.7189776545904976,
"grad_norm": 1.7641936297392027,
"learning_rate": 8.133846097573263e-06,
"loss": 0.8658,
"step": 4380
},
{
"epoch": 1.7229022051068212,
"grad_norm": 1.289046077866149,
"learning_rate": 8.09223356597767e-06,
"loss": 0.8291,
"step": 4390
},
{
"epoch": 1.726826755623145,
"grad_norm": 0.8325407065873541,
"learning_rate": 8.050655289000612e-06,
"loss": 0.8168,
"step": 4400
},
{
"epoch": 1.7307513061394686,
"grad_norm": 2.6948112568790124,
"learning_rate": 8.009112013194707e-06,
"loss": 0.8495,
"step": 4410
},
{
"epoch": 1.7346758566557923,
"grad_norm": 1.9659980108114699,
"learning_rate": 7.96760448448411e-06,
"loss": 0.8947,
"step": 4420
},
{
"epoch": 1.738600407172116,
"grad_norm": 1.8751083414206937,
"learning_rate": 7.926133448151121e-06,
"loss": 0.8493,
"step": 4430
},
{
"epoch": 1.7425249576884396,
"grad_norm": 1.2387882054830557,
"learning_rate": 7.884699648822816e-06,
"loss": 0.8267,
"step": 4440
},
{
"epoch": 1.7464495082047633,
"grad_norm": 1.1399825715598682,
"learning_rate": 7.843303830457654e-06,
"loss": 0.791,
"step": 4450
},
{
"epoch": 1.750374058721087,
"grad_norm": 2.411702115321597,
"learning_rate": 7.801946736332144e-06,
"loss": 0.8578,
"step": 4460
},
{
"epoch": 1.7542986092374107,
"grad_norm": 2.125672710751084,
"learning_rate": 7.760629109027488e-06,
"loss": 0.8945,
"step": 4470
},
{
"epoch": 1.7582231597537343,
"grad_norm": 1.6589788634772225,
"learning_rate": 7.719351690416234e-06,
"loss": 0.8528,
"step": 4480
},
{
"epoch": 1.762147710270058,
"grad_norm": 1.2339299024543553,
"learning_rate": 7.678115221648983e-06,
"loss": 0.8264,
"step": 4490
},
{
"epoch": 1.7660722607863817,
"grad_norm": 0.8818646846150126,
"learning_rate": 7.636920443141057e-06,
"loss": 0.7858,
"step": 4500
},
{
"epoch": 1.7699968113027054,
"grad_norm": 2.227068159454661,
"learning_rate": 7.595768094559226e-06,
"loss": 0.8546,
"step": 4510
},
{
"epoch": 1.773921361819029,
"grad_norm": 2.0038773703981527,
"learning_rate": 7.554658914808404e-06,
"loss": 0.8974,
"step": 4520
},
{
"epoch": 1.7778459123353527,
"grad_norm": 1.559249536864396,
"learning_rate": 7.513593642018398e-06,
"loss": 0.8488,
"step": 4530
},
{
"epoch": 1.7817704628516764,
"grad_norm": 1.2969189118652353,
"learning_rate": 7.472573013530657e-06,
"loss": 0.8509,
"step": 4540
},
{
"epoch": 1.785695013368,
"grad_norm": 0.8958343326615786,
"learning_rate": 7.431597765885013e-06,
"loss": 0.7997,
"step": 4550
},
{
"epoch": 1.7896195638843237,
"grad_norm": 2.217098339251589,
"learning_rate": 7.39066863480648e-06,
"loss": 0.8348,
"step": 4560
},
{
"epoch": 1.7935441144006474,
"grad_norm": 2.1265537658801117,
"learning_rate": 7.349786355192023e-06,
"loss": 0.8944,
"step": 4570
},
{
"epoch": 1.797468664916971,
"grad_norm": 1.6797170624498905,
"learning_rate": 7.308951661097379e-06,
"loss": 0.8448,
"step": 4580
},
{
"epoch": 1.8013932154332948,
"grad_norm": 1.1786118411152537,
"learning_rate": 7.268165285723875e-06,
"loss": 0.8474,
"step": 4590
},
{
"epoch": 1.8053177659496185,
"grad_norm": 0.8503022381368873,
"learning_rate": 7.227427961405245e-06,
"loss": 0.7908,
"step": 4600
},
{
"epoch": 1.8092423164659421,
"grad_norm": 2.3703275938509525,
"learning_rate": 7.186740419594505e-06,
"loss": 0.845,
"step": 4610
},
{
"epoch": 1.8131668669822658,
"grad_norm": 2.0576172422877073,
"learning_rate": 7.1461033908508004e-06,
"loss": 0.9065,
"step": 4620
},
{
"epoch": 1.8170914174985895,
"grad_norm": 1.6463925064837197,
"learning_rate": 7.1055176048263085e-06,
"loss": 0.842,
"step": 4630
},
{
"epoch": 1.8210159680149132,
"grad_norm": 1.3242222800874879,
"learning_rate": 7.0649837902531095e-06,
"loss": 0.8499,
"step": 4640
},
{
"epoch": 1.8249405185312368,
"grad_norm": 0.8538674555412706,
"learning_rate": 7.0245026749301315e-06,
"loss": 0.8046,
"step": 4650
},
{
"epoch": 1.8288650690475605,
"grad_norm": 2.4490816400507516,
"learning_rate": 6.984074985710068e-06,
"loss": 0.8529,
"step": 4660
},
{
"epoch": 1.8327896195638842,
"grad_norm": 2.0294960511407822,
"learning_rate": 6.943701448486313e-06,
"loss": 0.8992,
"step": 4670
},
{
"epoch": 1.8367141700802079,
"grad_norm": 1.680488532778669,
"learning_rate": 6.903382788179962e-06,
"loss": 0.8566,
"step": 4680
},
{
"epoch": 1.8406387205965316,
"grad_norm": 1.2281066012509496,
"learning_rate": 6.8631197287267636e-06,
"loss": 0.8376,
"step": 4690
},
{
"epoch": 1.8445632711128552,
"grad_norm": 1.2188089446996344,
"learning_rate": 6.82291299306414e-06,
"loss": 0.8058,
"step": 4700
},
{
"epoch": 1.848487821629179,
"grad_norm": 2.1748995033926497,
"learning_rate": 6.782763303118194e-06,
"loss": 0.8464,
"step": 4710
},
{
"epoch": 1.8524123721455026,
"grad_norm": 2.0279495030924046,
"learning_rate": 6.742671379790756e-06,
"loss": 0.8782,
"step": 4720
},
{
"epoch": 1.8563369226618263,
"grad_norm": 1.6914708882579899,
"learning_rate": 6.702637942946441e-06,
"loss": 0.8422,
"step": 4730
},
{
"epoch": 1.86026147317815,
"grad_norm": 1.3001032946364874,
"learning_rate": 6.662663711399705e-06,
"loss": 0.8189,
"step": 4740
},
{
"epoch": 1.8641860236944736,
"grad_norm": 0.9695568949982307,
"learning_rate": 6.622749402901971e-06,
"loss": 0.7972,
"step": 4750
},
{
"epoch": 1.8681105742107973,
"grad_norm": 2.3580357832714696,
"learning_rate": 6.5828957341287025e-06,
"loss": 0.8602,
"step": 4760
},
{
"epoch": 1.872035124727121,
"grad_norm": 2.053651822379713,
"learning_rate": 6.5431034206665686e-06,
"loss": 0.8946,
"step": 4770
},
{
"epoch": 1.8759596752434446,
"grad_norm": 1.6781331457753144,
"learning_rate": 6.503373177000582e-06,
"loss": 0.8479,
"step": 4780
},
{
"epoch": 1.8798842257597683,
"grad_norm": 1.3439461982630885,
"learning_rate": 6.463705716501261e-06,
"loss": 0.8108,
"step": 4790
},
{
"epoch": 1.883808776276092,
"grad_norm": 0.8215956260267698,
"learning_rate": 6.424101751411842e-06,
"loss": 0.8124,
"step": 4800
},
{
"epoch": 1.8877333267924157,
"grad_norm": 2.2909082972168275,
"learning_rate": 6.3845619928354676e-06,
"loss": 0.8253,
"step": 4810
},
{
"epoch": 1.8916578773087394,
"grad_norm": 2.1043172809575057,
"learning_rate": 6.345087150722441e-06,
"loss": 0.8767,
"step": 4820
},
{
"epoch": 1.895582427825063,
"grad_norm": 1.7239356576641465,
"learning_rate": 6.305677933857455e-06,
"loss": 0.8217,
"step": 4830
},
{
"epoch": 1.8995069783413867,
"grad_norm": 1.4857325973939928,
"learning_rate": 6.266335049846886e-06,
"loss": 0.8415,
"step": 4840
},
{
"epoch": 1.9034315288577104,
"grad_norm": 0.9334678918114299,
"learning_rate": 6.227059205106085e-06,
"loss": 0.7717,
"step": 4850
},
{
"epoch": 1.907356079374034,
"grad_norm": 2.2849232645875297,
"learning_rate": 6.187851104846676e-06,
"loss": 0.846,
"step": 4860
},
{
"epoch": 1.9112806298903577,
"grad_norm": 1.945016141615289,
"learning_rate": 6.1487114530639205e-06,
"loss": 0.8882,
"step": 4870
},
{
"epoch": 1.9152051804066814,
"grad_norm": 1.7795684379972176,
"learning_rate": 6.109640952524052e-06,
"loss": 0.8329,
"step": 4880
},
{
"epoch": 1.919129730923005,
"grad_norm": 1.2407252843374668,
"learning_rate": 6.070640304751677e-06,
"loss": 0.8251,
"step": 4890
},
{
"epoch": 1.9230542814393288,
"grad_norm": 0.8194399568120128,
"learning_rate": 6.031710210017171e-06,
"loss": 0.7867,
"step": 4900
},
{
"epoch": 1.9269788319556524,
"grad_norm": 2.3956208299915955,
"learning_rate": 5.992851367324097e-06,
"loss": 0.8433,
"step": 4910
},
{
"epoch": 1.9309033824719761,
"grad_norm": 2.0827483565619915,
"learning_rate": 5.954064474396675e-06,
"loss": 0.885,
"step": 4920
},
{
"epoch": 1.9348279329882998,
"grad_norm": 1.70357287888689,
"learning_rate": 5.915350227667225e-06,
"loss": 0.8385,
"step": 4930
},
{
"epoch": 1.9387524835046235,
"grad_norm": 1.2786111455423548,
"learning_rate": 5.876709322263696e-06,
"loss": 0.8207,
"step": 4940
},
{
"epoch": 1.9426770340209472,
"grad_norm": 0.7671237740151083,
"learning_rate": 5.838142451997155e-06,
"loss": 0.8048,
"step": 4950
},
{
"epoch": 1.9466015845372708,
"grad_norm": 2.3292976163560546,
"learning_rate": 5.799650309349348e-06,
"loss": 0.8462,
"step": 4960
},
{
"epoch": 1.9505261350535945,
"grad_norm": 2.077691017439424,
"learning_rate": 5.761233585460265e-06,
"loss": 0.9123,
"step": 4970
},
{
"epoch": 1.9544506855699182,
"grad_norm": 1.6563795333206879,
"learning_rate": 5.722892970115712e-06,
"loss": 0.8154,
"step": 4980
},
{
"epoch": 1.9583752360862419,
"grad_norm": 1.3593453278887475,
"learning_rate": 5.684629151734949e-06,
"loss": 0.8108,
"step": 4990
},
{
"epoch": 1.9622997866025655,
"grad_norm": 0.8477008150249254,
"learning_rate": 5.6464428173583174e-06,
"loss": 0.7722,
"step": 5000
},
{
"epoch": 1.9662243371188892,
"grad_norm": 2.3204148587243103,
"learning_rate": 5.608334652634914e-06,
"loss": 0.8387,
"step": 5010
},
{
"epoch": 1.970148887635213,
"grad_norm": 2.08224549041347,
"learning_rate": 5.570305341810252e-06,
"loss": 0.8696,
"step": 5020
},
{
"epoch": 1.9740734381515366,
"grad_norm": 1.6687767550667691,
"learning_rate": 5.532355567714013e-06,
"loss": 0.8536,
"step": 5030
},
{
"epoch": 1.9779979886678603,
"grad_norm": 1.2616375376723235,
"learning_rate": 5.494486011747761e-06,
"loss": 0.8102,
"step": 5040
},
{
"epoch": 1.981922539184184,
"grad_norm": 0.7574755635671572,
"learning_rate": 5.4566973538727216e-06,
"loss": 0.7677,
"step": 5050
},
{
"epoch": 1.9858470897005076,
"grad_norm": 2.309973402310073,
"learning_rate": 5.418990272597561e-06,
"loss": 0.839,
"step": 5060
},
{
"epoch": 1.9897716402168313,
"grad_norm": 2.196528214038538,
"learning_rate": 5.381365444966205e-06,
"loss": 0.8893,
"step": 5070
},
{
"epoch": 1.993696190733155,
"grad_norm": 1.6515133028295628,
"learning_rate": 5.3438235465456926e-06,
"loss": 0.8053,
"step": 5080
},
{
"epoch": 1.9976207412494786,
"grad_norm": 1.4943879150775243,
"learning_rate": 5.306365251414043e-06,
"loss": 0.798,
"step": 5090
},
{
"epoch": 1.999975471559273,
"eval_loss": 0.6450071930885315,
"eval_runtime": 1529.4328,
"eval_samples_per_second": 16.346,
"eval_steps_per_second": 4.086,
"step": 5096
},
{
"epoch": 2.0015698202065293,
"grad_norm": 1.4784398750719419,
"learning_rate": 5.268991232148137e-06,
"loss": 0.8525,
"step": 5100
},
{
"epoch": 2.005494370722853,
"grad_norm": 9.338133108180491,
"learning_rate": 5.2317021598116635e-06,
"loss": 0.5966,
"step": 5110
},
{
"epoch": 2.0094189212391766,
"grad_norm": 1.8570667185487677,
"learning_rate": 5.1944987039430535e-06,
"loss": 0.6489,
"step": 5120
},
{
"epoch": 2.0133434717555003,
"grad_norm": 2.1202392786052697,
"learning_rate": 5.157381532543473e-06,
"loss": 0.6784,
"step": 5130
},
{
"epoch": 2.017268022271824,
"grad_norm": 1.700823660700944,
"learning_rate": 5.120351312064802e-06,
"loss": 0.6222,
"step": 5140
},
{
"epoch": 2.0211925727881477,
"grad_norm": 1.3821129005652262,
"learning_rate": 5.083408707397704e-06,
"loss": 0.6147,
"step": 5150
},
{
"epoch": 2.0251171233044714,
"grad_norm": 1.0951660096886453,
"learning_rate": 5.046554381859663e-06,
"loss": 0.5845,
"step": 5160
},
{
"epoch": 2.029041673820795,
"grad_norm": 2.138790453625462,
"learning_rate": 5.009788997183074e-06,
"loss": 0.6237,
"step": 5170
},
{
"epoch": 2.0329662243371187,
"grad_norm": 2.0579134568589597,
"learning_rate": 4.973113213503379e-06,
"loss": 0.705,
"step": 5180
},
{
"epoch": 2.0368907748534424,
"grad_norm": 1.75667330374861,
"learning_rate": 4.936527689347195e-06,
"loss": 0.6389,
"step": 5190
},
{
"epoch": 2.040815325369766,
"grad_norm": 1.308594348694505,
"learning_rate": 4.9000330816205e-06,
"loss": 0.6035,
"step": 5200
},
{
"epoch": 2.0447398758860897,
"grad_norm": 0.9410941640978612,
"learning_rate": 4.863630045596838e-06,
"loss": 0.5541,
"step": 5210
},
{
"epoch": 2.0486644264024134,
"grad_norm": 2.1522254372307357,
"learning_rate": 4.8273192349055405e-06,
"loss": 0.5952,
"step": 5220
},
{
"epoch": 2.052588976918737,
"grad_norm": 2.036592335343225,
"learning_rate": 4.791101301520016e-06,
"loss": 0.6809,
"step": 5230
},
{
"epoch": 2.0565135274350608,
"grad_norm": 1.686946791293678,
"learning_rate": 4.754976895746007e-06,
"loss": 0.6342,
"step": 5240
},
{
"epoch": 2.0604380779513845,
"grad_norm": 1.3407168000815266,
"learning_rate": 4.718946666209966e-06,
"loss": 0.6237,
"step": 5250
},
{
"epoch": 2.064362628467708,
"grad_norm": 0.8377432352577971,
"learning_rate": 4.683011259847346e-06,
"loss": 0.5427,
"step": 5260
},
{
"epoch": 2.068287178984032,
"grad_norm": 2.1455140854622496,
"learning_rate": 4.647171321891034e-06,
"loss": 0.6384,
"step": 5270
},
{
"epoch": 2.0722117295003555,
"grad_norm": 1.99774007410406,
"learning_rate": 4.61142749585975e-06,
"loss": 0.667,
"step": 5280
},
{
"epoch": 2.076136280016679,
"grad_norm": 1.7474636373639787,
"learning_rate": 4.575780423546476e-06,
"loss": 0.6309,
"step": 5290
},
{
"epoch": 2.080060830533003,
"grad_norm": 1.3359269678333927,
"learning_rate": 4.540230745006962e-06,
"loss": 0.5829,
"step": 5300
},
{
"epoch": 2.0839853810493265,
"grad_norm": 0.7162710672069984,
"learning_rate": 4.504779098548209e-06,
"loss": 0.5332,
"step": 5310
},
{
"epoch": 2.08790993156565,
"grad_norm": 2.0989757061484893,
"learning_rate": 4.469426120717025e-06,
"loss": 0.624,
"step": 5320
},
{
"epoch": 2.091834482081974,
"grad_norm": 2.1426189322836975,
"learning_rate": 4.434172446288579e-06,
"loss": 0.6681,
"step": 5330
},
{
"epoch": 2.0957590325982975,
"grad_norm": 1.776402952977803,
"learning_rate": 4.399018708255018e-06,
"loss": 0.6193,
"step": 5340
},
{
"epoch": 2.099683583114621,
"grad_norm": 1.3008240753278815,
"learning_rate": 4.363965537814102e-06,
"loss": 0.6082,
"step": 5350
},
{
"epoch": 2.103608133630945,
"grad_norm": 0.8996256226429636,
"learning_rate": 4.329013564357848e-06,
"loss": 0.5629,
"step": 5360
},
{
"epoch": 2.1075326841472686,
"grad_norm": 2.0984085235237004,
"learning_rate": 4.294163415461258e-06,
"loss": 0.6169,
"step": 5370
},
{
"epoch": 2.1114572346635923,
"grad_norm": 2.1055348326355667,
"learning_rate": 4.259415716871037e-06,
"loss": 0.6725,
"step": 5380
},
{
"epoch": 2.115381785179916,
"grad_norm": 1.7564083013798226,
"learning_rate": 4.224771092494355e-06,
"loss": 0.6177,
"step": 5390
},
{
"epoch": 2.1193063356962396,
"grad_norm": 1.3081730861592322,
"learning_rate": 4.1902301643876555e-06,
"loss": 0.5994,
"step": 5400
},
{
"epoch": 2.1232308862125633,
"grad_norm": 0.7914811793114824,
"learning_rate": 4.155793552745465e-06,
"loss": 0.5642,
"step": 5410
},
{
"epoch": 2.127155436728887,
"grad_norm": 2.4378415532004287,
"learning_rate": 4.1214618758892865e-06,
"loss": 0.6125,
"step": 5420
},
{
"epoch": 2.1310799872452106,
"grad_norm": 2.0526253778498154,
"learning_rate": 4.087235750256469e-06,
"loss": 0.666,
"step": 5430
},
{
"epoch": 2.1350045377615343,
"grad_norm": 1.8656396637088302,
"learning_rate": 4.053115790389159e-06,
"loss": 0.6394,
"step": 5440
},
{
"epoch": 2.138929088277858,
"grad_norm": 1.209266875293654,
"learning_rate": 4.019102608923262e-06,
"loss": 0.6132,
"step": 5450
},
{
"epoch": 2.1428536387941817,
"grad_norm": 1.1832551572175067,
"learning_rate": 3.985196816577433e-06,
"loss": 0.5475,
"step": 5460
},
{
"epoch": 2.1467781893105053,
"grad_norm": 2.2143801889042276,
"learning_rate": 3.951399022142127e-06,
"loss": 0.608,
"step": 5470
},
{
"epoch": 2.150702739826829,
"grad_norm": 2.128346152883008,
"learning_rate": 3.917709832468641e-06,
"loss": 0.6848,
"step": 5480
},
{
"epoch": 2.1546272903431527,
"grad_norm": 1.6896984103464467,
"learning_rate": 3.884129852458253e-06,
"loss": 0.6284,
"step": 5490
},
{
"epoch": 2.1585518408594764,
"grad_norm": 1.222694494025062,
"learning_rate": 3.850659685051336e-06,
"loss": 0.5898,
"step": 5500
},
{
"epoch": 2.1624763913758,
"grad_norm": 0.8066312626727323,
"learning_rate": 3.817299931216537e-06,
"loss": 0.546,
"step": 5510
},
{
"epoch": 2.1664009418921237,
"grad_norm": 2.05281043982966,
"learning_rate": 3.784051189939996e-06,
"loss": 0.6217,
"step": 5520
},
{
"epoch": 2.1703254924084474,
"grad_norm": 2.0329973993617556,
"learning_rate": 3.7509140582145707e-06,
"loss": 0.6679,
"step": 5530
},
{
"epoch": 2.174250042924771,
"grad_norm": 1.7930592907009166,
"learning_rate": 3.7178891310291444e-06,
"loss": 0.6302,
"step": 5540
},
{
"epoch": 2.1781745934410948,
"grad_norm": 1.3304481186905044,
"learning_rate": 3.6849770013579135e-06,
"loss": 0.5972,
"step": 5550
},
{
"epoch": 2.1820991439574184,
"grad_norm": 0.8862316368226267,
"learning_rate": 3.652178260149768e-06,
"loss": 0.5508,
"step": 5560
},
{
"epoch": 2.186023694473742,
"grad_norm": 2.16438233599458,
"learning_rate": 3.619493496317662e-06,
"loss": 0.6113,
"step": 5570
},
{
"epoch": 2.189948244990066,
"grad_norm": 2.097362156115183,
"learning_rate": 3.5869232967280466e-06,
"loss": 0.678,
"step": 5580
},
{
"epoch": 2.1938727955063895,
"grad_norm": 1.7293983928389607,
"learning_rate": 3.554468246190337e-06,
"loss": 0.6255,
"step": 5590
},
{
"epoch": 2.197797346022713,
"grad_norm": 1.2526910240559423,
"learning_rate": 3.522128927446392e-06,
"loss": 0.6191,
"step": 5600
},
{
"epoch": 2.201721896539037,
"grad_norm": 0.779154938656075,
"learning_rate": 3.489905921160083e-06,
"loss": 0.5403,
"step": 5610
},
{
"epoch": 2.2056464470553605,
"grad_norm": 2.0546436977432094,
"learning_rate": 3.4577998059068354e-06,
"loss": 0.6159,
"step": 5620
},
{
"epoch": 2.209570997571684,
"grad_norm": 2.141525722727545,
"learning_rate": 3.4258111581632634e-06,
"loss": 0.6876,
"step": 5630
},
{
"epoch": 2.213495548088008,
"grad_norm": 1.7486151559652525,
"learning_rate": 3.3939405522968105e-06,
"loss": 0.6232,
"step": 5640
},
{
"epoch": 2.2174200986043315,
"grad_norm": 1.2463505933135222,
"learning_rate": 3.362188560555434e-06,
"loss": 0.603,
"step": 5650
},
{
"epoch": 2.221344649120655,
"grad_norm": 0.7557156986288721,
"learning_rate": 3.3305557530573363e-06,
"loss": 0.5734,
"step": 5660
},
{
"epoch": 2.225269199636979,
"grad_norm": 2.6186065252503994,
"learning_rate": 3.2990426977807156e-06,
"loss": 0.6169,
"step": 5670
},
{
"epoch": 2.2291937501533026,
"grad_norm": 2.1713884725106314,
"learning_rate": 3.2676499605535918e-06,
"loss": 0.6557,
"step": 5680
},
{
"epoch": 2.2331183006696262,
"grad_norm": 1.8496259928018195,
"learning_rate": 3.2363781050436105e-06,
"loss": 0.6224,
"step": 5690
},
{
"epoch": 2.23704285118595,
"grad_norm": 1.3375119127996462,
"learning_rate": 3.2052276927479677e-06,
"loss": 0.6029,
"step": 5700
},
{
"epoch": 2.2409674017022736,
"grad_norm": 0.847985655236947,
"learning_rate": 3.1741992829832924e-06,
"loss": 0.5552,
"step": 5710
},
{
"epoch": 2.2448919522185973,
"grad_norm": 2.1771913040167212,
"learning_rate": 3.143293432875607e-06,
"loss": 0.6089,
"step": 5720
},
{
"epoch": 2.248816502734921,
"grad_norm": 2.0426822342569837,
"learning_rate": 3.112510697350348e-06,
"loss": 0.6927,
"step": 5730
},
{
"epoch": 2.2527410532512446,
"grad_norm": 1.8247017082924184,
"learning_rate": 3.081851629122372e-06,
"loss": 0.6389,
"step": 5740
},
{
"epoch": 2.2566656037675683,
"grad_norm": 1.3403447493161933,
"learning_rate": 3.051316778686055e-06,
"loss": 0.5947,
"step": 5750
},
{
"epoch": 2.260590154283892,
"grad_norm": 0.8089857773862223,
"learning_rate": 3.0209066943053944e-06,
"loss": 0.5622,
"step": 5760
},
{
"epoch": 2.2645147048002157,
"grad_norm": 2.3577656985489477,
"learning_rate": 2.990621922004172e-06,
"loss": 0.5892,
"step": 5770
},
{
"epoch": 2.2684392553165393,
"grad_norm": 2.1740666869738323,
"learning_rate": 2.960463005556149e-06,
"loss": 0.672,
"step": 5780
},
{
"epoch": 2.272363805832863,
"grad_norm": 1.829981750443929,
"learning_rate": 2.9304304864752886e-06,
"loss": 0.6373,
"step": 5790
},
{
"epoch": 2.2762883563491867,
"grad_norm": 1.3281957095626744,
"learning_rate": 2.900524904006061e-06,
"loss": 0.5975,
"step": 5800
},
{
"epoch": 2.2802129068655104,
"grad_norm": 0.7087417039119808,
"learning_rate": 2.87074679511373e-06,
"loss": 0.5296,
"step": 5810
},
{
"epoch": 2.284137457381834,
"grad_norm": 2.109191274545063,
"learning_rate": 2.8410966944747377e-06,
"loss": 0.5962,
"step": 5820
},
{
"epoch": 2.2880620078981577,
"grad_norm": 2.241584819214679,
"learning_rate": 2.8115751344670863e-06,
"loss": 0.6636,
"step": 5830
},
{
"epoch": 2.2919865584144814,
"grad_norm": 1.7605810701006008,
"learning_rate": 2.782182645160789e-06,
"loss": 0.6265,
"step": 5840
},
{
"epoch": 2.295911108930805,
"grad_norm": 1.2836328256236162,
"learning_rate": 2.7529197543083507e-06,
"loss": 0.5931,
"step": 5850
},
{
"epoch": 2.2998356594471288,
"grad_norm": 0.9519727219083821,
"learning_rate": 2.7237869873352827e-06,
"loss": 0.5509,
"step": 5860
},
{
"epoch": 2.3037602099634524,
"grad_norm": 2.1895645275891704,
"learning_rate": 2.6947848673306853e-06,
"loss": 0.6199,
"step": 5870
},
{
"epoch": 2.307684760479776,
"grad_norm": 2.0598817109009904,
"learning_rate": 2.6659139150378377e-06,
"loss": 0.6591,
"step": 5880
},
{
"epoch": 2.3116093109961,
"grad_norm": 1.8143316778974414,
"learning_rate": 2.6371746488448614e-06,
"loss": 0.6347,
"step": 5890
},
{
"epoch": 2.3155338615124235,
"grad_norm": 1.3541045070964877,
"learning_rate": 2.6085675847754155e-06,
"loss": 0.586,
"step": 5900
},
{
"epoch": 2.319458412028747,
"grad_norm": 0.7923721169078987,
"learning_rate": 2.5800932364794064e-06,
"loss": 0.5212,
"step": 5910
},
{
"epoch": 2.323382962545071,
"grad_norm": 2.794475468134139,
"learning_rate": 2.5517521152237966e-06,
"loss": 0.5974,
"step": 5920
},
{
"epoch": 2.3273075130613945,
"grad_norm": 2.2143050890712255,
"learning_rate": 2.5235447298834003e-06,
"loss": 0.6684,
"step": 5930
},
{
"epoch": 2.331232063577718,
"grad_norm": 1.7900709742899215,
"learning_rate": 2.49547158693176e-06,
"loss": 0.6278,
"step": 5940
},
{
"epoch": 2.335156614094042,
"grad_norm": 1.271494366484403,
"learning_rate": 2.4675331904320533e-06,
"loss": 0.5929,
"step": 5950
},
{
"epoch": 2.3390811646103655,
"grad_norm": 0.8556427431737861,
"learning_rate": 2.43973004202803e-06,
"loss": 0.5524,
"step": 5960
},
{
"epoch": 2.343005715126689,
"grad_norm": 2.117180082843371,
"learning_rate": 2.412062640935021e-06,
"loss": 0.6013,
"step": 5970
},
{
"epoch": 2.346930265643013,
"grad_norm": 2.1484003642018457,
"learning_rate": 2.3845314839309563e-06,
"loss": 0.6632,
"step": 5980
},
{
"epoch": 2.3508548161593366,
"grad_norm": 1.8030873162593484,
"learning_rate": 2.3571370653474656e-06,
"loss": 0.6168,
"step": 5990
},
{
"epoch": 2.3547793666756602,
"grad_norm": 1.2601737288351715,
"learning_rate": 2.329879877060981e-06,
"loss": 0.5886,
"step": 6000
},
{
"epoch": 2.358703917191984,
"grad_norm": 0.9165582307879456,
"learning_rate": 2.302760408483926e-06,
"loss": 0.5428,
"step": 6010
},
{
"epoch": 2.3626284677083076,
"grad_norm": 2.0984461367902605,
"learning_rate": 2.275779146555915e-06,
"loss": 0.6007,
"step": 6020
},
{
"epoch": 2.3665530182246313,
"grad_norm": 2.1846524601461894,
"learning_rate": 2.2489365757350132e-06,
"loss": 0.664,
"step": 6030
},
{
"epoch": 2.370477568740955,
"grad_norm": 1.7647491805175937,
"learning_rate": 2.2222331779890393e-06,
"loss": 0.6257,
"step": 6040
},
{
"epoch": 2.3744021192572786,
"grad_norm": 1.3444624902761966,
"learning_rate": 2.1956694327869043e-06,
"loss": 0.6041,
"step": 6050
},
{
"epoch": 2.3783266697736023,
"grad_norm": 0.8924579097538136,
"learning_rate": 2.16924581709002e-06,
"loss": 0.5369,
"step": 6060
},
{
"epoch": 2.382251220289926,
"grad_norm": 2.1816842093278526,
"learning_rate": 2.142962805343708e-06,
"loss": 0.5806,
"step": 6070
},
{
"epoch": 2.3861757708062497,
"grad_norm": 2.1315338386325138,
"learning_rate": 2.1168208694687108e-06,
"loss": 0.6934,
"step": 6080
},
{
"epoch": 2.3901003213225733,
"grad_norm": 1.8401419512172745,
"learning_rate": 2.0908204788526965e-06,
"loss": 0.6473,
"step": 6090
},
{
"epoch": 2.394024871838897,
"grad_norm": 1.256508614588367,
"learning_rate": 2.064962100341842e-06,
"loss": 0.6,
"step": 6100
},
{
"epoch": 2.3979494223552207,
"grad_norm": 0.7725755529070791,
"learning_rate": 2.039246198232446e-06,
"loss": 0.5488,
"step": 6110
},
{
"epoch": 2.4018739728715444,
"grad_norm": 2.1092659651397474,
"learning_rate": 2.0136732342625874e-06,
"loss": 0.5748,
"step": 6120
},
{
"epoch": 2.405798523387868,
"grad_norm": 2.202730059861675,
"learning_rate": 1.9882436676038477e-06,
"loss": 0.6778,
"step": 6130
},
{
"epoch": 2.4097230739041917,
"grad_norm": 1.7146488474319233,
"learning_rate": 1.962957954853055e-06,
"loss": 0.642,
"step": 6140
},
{
"epoch": 2.4136476244205154,
"grad_norm": 1.2875382567695426,
"learning_rate": 1.9378165500240943e-06,
"loss": 0.5935,
"step": 6150
},
{
"epoch": 2.417572174936839,
"grad_norm": 0.8159718677862676,
"learning_rate": 1.912819904539749e-06,
"loss": 0.556,
"step": 6160
},
{
"epoch": 2.4214967254531627,
"grad_norm": 2.147420171464766,
"learning_rate": 1.887968467223591e-06,
"loss": 0.6084,
"step": 6170
},
{
"epoch": 2.4254212759694864,
"grad_norm": 2.352418026966586,
"learning_rate": 1.8632626842919398e-06,
"loss": 0.6647,
"step": 6180
},
{
"epoch": 2.42934582648581,
"grad_norm": 1.843956419568753,
"learning_rate": 1.8387029993458273e-06,
"loss": 0.6224,
"step": 6190
},
{
"epoch": 2.4332703770021338,
"grad_norm": 1.215534258708622,
"learning_rate": 1.8142898533630536e-06,
"loss": 0.6116,
"step": 6200
},
{
"epoch": 2.4371949275184575,
"grad_norm": 0.7790773570638417,
"learning_rate": 1.7900236846902575e-06,
"loss": 0.5395,
"step": 6210
},
{
"epoch": 2.441119478034781,
"grad_norm": 2.274459819686287,
"learning_rate": 1.765904929035046e-06,
"loss": 0.6089,
"step": 6220
},
{
"epoch": 2.445044028551105,
"grad_norm": 2.156030969614435,
"learning_rate": 1.7419340194581803e-06,
"loss": 0.6517,
"step": 6230
},
{
"epoch": 2.4489685790674285,
"grad_norm": 1.8023260247461752,
"learning_rate": 1.7181113863657805e-06,
"loss": 0.6312,
"step": 6240
},
{
"epoch": 2.452893129583752,
"grad_norm": 1.6128467278404588,
"learning_rate": 1.6944374575016253e-06,
"loss": 0.6097,
"step": 6250
},
{
"epoch": 2.456817680100076,
"grad_norm": 0.7816281842546718,
"learning_rate": 1.670912657939443e-06,
"loss": 0.5411,
"step": 6260
},
{
"epoch": 2.4607422306163995,
"grad_norm": 2.2598007072156028,
"learning_rate": 1.6475374100753017e-06,
"loss": 0.6139,
"step": 6270
},
{
"epoch": 2.464666781132723,
"grad_norm": 2.1326684182909936,
"learning_rate": 1.624312133620013e-06,
"loss": 0.6849,
"step": 6280
},
{
"epoch": 2.468591331649047,
"grad_norm": 1.793136525038962,
"learning_rate": 1.6012372455915993e-06,
"loss": 0.6165,
"step": 6290
},
{
"epoch": 2.4725158821653705,
"grad_norm": 1.2436395590561673,
"learning_rate": 1.5783131603078083e-06,
"loss": 0.5958,
"step": 6300
},
{
"epoch": 2.4764404326816942,
"grad_norm": 0.8178284156434679,
"learning_rate": 1.555540289378663e-06,
"loss": 0.542,
"step": 6310
},
{
"epoch": 2.480364983198018,
"grad_norm": 2.219348641209819,
"learning_rate": 1.532919041699089e-06,
"loss": 0.6146,
"step": 6320
},
{
"epoch": 2.4842895337143416,
"grad_norm": 2.1421139951368375,
"learning_rate": 1.510449823441561e-06,
"loss": 0.669,
"step": 6330
},
{
"epoch": 2.4882140842306653,
"grad_norm": 1.7736069610642504,
"learning_rate": 1.4881330380488014e-06,
"loss": 0.6325,
"step": 6340
},
{
"epoch": 2.492138634746989,
"grad_norm": 1.2940938084497826,
"learning_rate": 1.4659690862265675e-06,
"loss": 0.5918,
"step": 6350
},
{
"epoch": 2.4960631852633126,
"grad_norm": 0.7888355624944882,
"learning_rate": 1.4439583659364154e-06,
"loss": 0.5432,
"step": 6360
},
{
"epoch": 2.4999877357796363,
"grad_norm": 2.2684459134167003,
"learning_rate": 1.4221012723885874e-06,
"loss": 0.6068,
"step": 6370
},
{
"epoch": 2.50391228629596,
"grad_norm": 2.303631890847662,
"learning_rate": 1.400398198034897e-06,
"loss": 0.6815,
"step": 6380
},
{
"epoch": 2.5078368368122836,
"grad_norm": 1.725928594909339,
"learning_rate": 1.3788495325616912e-06,
"loss": 0.629,
"step": 6390
},
{
"epoch": 2.5117613873286073,
"grad_norm": 1.2724712580718072,
"learning_rate": 1.357455662882855e-06,
"loss": 0.5858,
"step": 6400
},
{
"epoch": 2.515685937844931,
"grad_norm": 0.8196671527768702,
"learning_rate": 1.3362169731328534e-06,
"loss": 0.543,
"step": 6410
},
{
"epoch": 2.5196104883612547,
"grad_norm": 2.2121857648933614,
"learning_rate": 1.3151338446598483e-06,
"loss": 0.5918,
"step": 6420
},
{
"epoch": 2.5235350388775784,
"grad_norm": 2.07329199306329,
"learning_rate": 1.2942066560188349e-06,
"loss": 0.65,
"step": 6430
},
{
"epoch": 2.527459589393902,
"grad_norm": 1.7691554410042842,
"learning_rate": 1.2734357829648624e-06,
"loss": 0.6245,
"step": 6440
},
{
"epoch": 2.5313841399102257,
"grad_norm": 1.2397356761301885,
"learning_rate": 1.2528215984462766e-06,
"loss": 0.5757,
"step": 6450
},
{
"epoch": 2.5353086904265494,
"grad_norm": 0.8552163717265764,
"learning_rate": 1.23236447259802e-06,
"loss": 0.5636,
"step": 6460
},
{
"epoch": 2.539233240942873,
"grad_norm": 2.019030277835733,
"learning_rate": 1.2120647727349977e-06,
"loss": 0.5962,
"step": 6470
},
{
"epoch": 2.5431577914591967,
"grad_norm": 2.1298373917453928,
"learning_rate": 1.1919228633454738e-06,
"loss": 0.6936,
"step": 6480
},
{
"epoch": 2.5470823419755204,
"grad_norm": 1.8011050211359714,
"learning_rate": 1.1719391060845298e-06,
"loss": 0.6272,
"step": 6490
},
{
"epoch": 2.551006892491844,
"grad_norm": 1.221657027890025,
"learning_rate": 1.152113859767565e-06,
"loss": 0.6286,
"step": 6500
},
{
"epoch": 2.5549314430081678,
"grad_norm": 0.8061467765642856,
"learning_rate": 1.1324474803638653e-06,
"loss": 0.5501,
"step": 6510
},
{
"epoch": 2.5588559935244914,
"grad_norm": 2.051549539190188,
"learning_rate": 1.1129403209902034e-06,
"loss": 0.6067,
"step": 6520
},
{
"epoch": 2.562780544040815,
"grad_norm": 2.3142399536514073,
"learning_rate": 1.0935927319044959e-06,
"loss": 0.6484,
"step": 6530
},
{
"epoch": 2.566705094557139,
"grad_norm": 1.7767173325015122,
"learning_rate": 1.0744050604995237e-06,
"loss": 0.6047,
"step": 6540
},
{
"epoch": 2.5706296450734625,
"grad_norm": 1.3099949970790892,
"learning_rate": 1.0553776512966886e-06,
"loss": 0.5826,
"step": 6550
},
{
"epoch": 2.574554195589786,
"grad_norm": 0.8416372830973359,
"learning_rate": 1.0365108459398277e-06,
"loss": 0.5418,
"step": 6560
},
{
"epoch": 2.57847874610611,
"grad_norm": 2.127337984518528,
"learning_rate": 1.0178049831890768e-06,
"loss": 0.6093,
"step": 6570
},
{
"epoch": 2.5824032966224335,
"grad_norm": 2.252528047531405,
"learning_rate": 9.992603989147941e-07,
"loss": 0.6867,
"step": 6580
},
{
"epoch": 2.586327847138757,
"grad_norm": 1.7132261228204282,
"learning_rate": 9.808774260915243e-07,
"loss": 0.6564,
"step": 6590
},
{
"epoch": 2.590252397655081,
"grad_norm": 1.2934365792962297,
"learning_rate": 9.626563947920231e-07,
"loss": 0.5691,
"step": 6600
},
{
"epoch": 2.5941769481714045,
"grad_norm": 0.8786324009918022,
"learning_rate": 9.445976321813277e-07,
"loss": 0.5383,
"step": 6610
},
{
"epoch": 2.598101498687728,
"grad_norm": 2.2501075917295212,
"learning_rate": 9.267014625108806e-07,
"loss": 0.5817,
"step": 6620
},
{
"epoch": 2.602026049204052,
"grad_norm": 2.2174253075426167,
"learning_rate": 9.089682071127171e-07,
"loss": 0.6744,
"step": 6630
},
{
"epoch": 2.6059505997203756,
"grad_norm": 1.7616659355193223,
"learning_rate": 8.91398184393687e-07,
"loss": 0.6359,
"step": 6640
},
{
"epoch": 2.6098751502366992,
"grad_norm": 1.3089314769046627,
"learning_rate": 8.739917098297357e-07,
"loss": 0.6045,
"step": 6650
},
{
"epoch": 2.613799700753023,
"grad_norm": 0.7444796805297569,
"learning_rate": 8.567490959602509e-07,
"loss": 0.5295,
"step": 6660
},
{
"epoch": 2.6177242512693466,
"grad_norm": 2.2630343377076287,
"learning_rate": 8.396706523824372e-07,
"loss": 0.6244,
"step": 6670
},
{
"epoch": 2.6216488017856703,
"grad_norm": 2.0443361827637836,
"learning_rate": 8.227566857457702e-07,
"loss": 0.6894,
"step": 6680
},
{
"epoch": 2.625573352301994,
"grad_norm": 1.7688362533725879,
"learning_rate": 8.060074997464773e-07,
"loss": 0.6192,
"step": 6690
},
{
"epoch": 2.6294979028183176,
"grad_norm": 1.2229083921293311,
"learning_rate": 7.894233951220953e-07,
"loss": 0.5856,
"step": 6700
},
{
"epoch": 2.6334224533346413,
"grad_norm": 0.76533431687556,
"learning_rate": 7.730046696460691e-07,
"loss": 0.53,
"step": 6710
},
{
"epoch": 2.637347003850965,
"grad_norm": 2.264231653926727,
"learning_rate": 7.567516181223966e-07,
"loss": 0.5991,
"step": 6720
},
{
"epoch": 2.6412715543672887,
"grad_norm": 2.1991521419444267,
"learning_rate": 7.406645323803463e-07,
"loss": 0.6315,
"step": 6730
},
{
"epoch": 2.6451961048836123,
"grad_norm": 1.7239735369036862,
"learning_rate": 7.247437012692104e-07,
"loss": 0.6427,
"step": 6740
},
{
"epoch": 2.649120655399936,
"grad_norm": 1.2790232035397695,
"learning_rate": 7.089894106531214e-07,
"loss": 0.594,
"step": 6750
},
{
"epoch": 2.6530452059162597,
"grad_norm": 0.955171760712246,
"learning_rate": 6.934019434059213e-07,
"loss": 0.5533,
"step": 6760
},
{
"epoch": 2.6569697564325834,
"grad_norm": 2.1955164062676644,
"learning_rate": 6.779815794060718e-07,
"loss": 0.5936,
"step": 6770
},
{
"epoch": 2.660894306948907,
"grad_norm": 2.2404548484277664,
"learning_rate": 6.627285955316476e-07,
"loss": 0.6513,
"step": 6780
},
{
"epoch": 2.6648188574652307,
"grad_norm": 1.8828883729369554,
"learning_rate": 6.476432656553411e-07,
"loss": 0.6286,
"step": 6790
},
{
"epoch": 2.6687434079815544,
"grad_norm": 1.2443150219704873,
"learning_rate": 6.327258606395736e-07,
"loss": 0.5939,
"step": 6800
},
{
"epoch": 2.672667958497878,
"grad_norm": 0.9014377738542545,
"learning_rate": 6.179766483316041e-07,
"loss": 0.5334,
"step": 6810
},
{
"epoch": 2.6765925090142018,
"grad_norm": 2.183823820105367,
"learning_rate": 6.03395893558737e-07,
"loss": 0.5913,
"step": 6820
},
{
"epoch": 2.6805170595305254,
"grad_norm": 2.1912679703089313,
"learning_rate": 5.889838581235641e-07,
"loss": 0.6719,
"step": 6830
},
{
"epoch": 2.684441610046849,
"grad_norm": 1.7923155245248856,
"learning_rate": 5.747408007992572e-07,
"loss": 0.6208,
"step": 6840
},
{
"epoch": 2.688366160563173,
"grad_norm": 1.2494561675837905,
"learning_rate": 5.606669773249296e-07,
"loss": 0.596,
"step": 6850
},
{
"epoch": 2.6922907110794965,
"grad_norm": 0.907833148614821,
"learning_rate": 5.467626404010407e-07,
"loss": 0.5372,
"step": 6860
},
{
"epoch": 2.69621526159582,
"grad_norm": 2.2934267800117856,
"learning_rate": 5.330280396848619e-07,
"loss": 0.609,
"step": 6870
},
{
"epoch": 2.700139812112144,
"grad_norm": 2.114541613337409,
"learning_rate": 5.194634217859851e-07,
"loss": 0.6611,
"step": 6880
},
{
"epoch": 2.7040643626284675,
"grad_norm": 1.754501104961409,
"learning_rate": 5.060690302619053e-07,
"loss": 0.6157,
"step": 6890
},
{
"epoch": 2.707988913144791,
"grad_norm": 1.364912899052101,
"learning_rate": 4.92845105613644e-07,
"loss": 0.5929,
"step": 6900
},
{
"epoch": 2.711913463661115,
"grad_norm": 0.7372286271827898,
"learning_rate": 4.797918852814254e-07,
"loss": 0.5314,
"step": 6910
},
{
"epoch": 2.7158380141774385,
"grad_norm": 2.073475468780587,
"learning_rate": 4.6690960364041973e-07,
"loss": 0.5976,
"step": 6920
},
{
"epoch": 2.719762564693762,
"grad_norm": 2.2358152929450963,
"learning_rate": 4.5419849199653364e-07,
"loss": 0.6623,
"step": 6930
},
{
"epoch": 2.723687115210086,
"grad_norm": 1.815422184822613,
"learning_rate": 4.416587785822568e-07,
"loss": 0.6073,
"step": 6940
},
{
"epoch": 2.7276116657264096,
"grad_norm": 1.350125241744432,
"learning_rate": 4.2929068855256275e-07,
"loss": 0.5984,
"step": 6950
},
{
"epoch": 2.7315362162427332,
"grad_norm": 0.7223399634960578,
"learning_rate": 4.170944439808622e-07,
"loss": 0.5491,
"step": 6960
},
{
"epoch": 2.735460766759057,
"grad_norm": 2.1836663497672455,
"learning_rate": 4.0507026385502747e-07,
"loss": 0.5941,
"step": 6970
},
{
"epoch": 2.7393853172753806,
"grad_norm": 2.154640259731243,
"learning_rate": 3.932183640734466e-07,
"loss": 0.6781,
"step": 6980
},
{
"epoch": 2.7433098677917043,
"grad_norm": 1.908154353492756,
"learning_rate": 3.8153895744115767e-07,
"loss": 0.6178,
"step": 6990
},
{
"epoch": 2.747234418308028,
"grad_norm": 1.2339098485081115,
"learning_rate": 3.700322536660228e-07,
"loss": 0.5819,
"step": 7000
},
{
"epoch": 2.7511589688243516,
"grad_norm": 0.9319742069385385,
"learning_rate": 3.586984593549614e-07,
"loss": 0.5296,
"step": 7010
},
{
"epoch": 2.7550835193406753,
"grad_norm": 2.2839758727554424,
"learning_rate": 3.475377780102451e-07,
"loss": 0.5919,
"step": 7020
},
{
"epoch": 2.759008069856999,
"grad_norm": 2.184552450870539,
"learning_rate": 3.365504100258399e-07,
"loss": 0.6341,
"step": 7030
},
{
"epoch": 2.7629326203733227,
"grad_norm": 1.7194768639357731,
"learning_rate": 3.2573655268380746e-07,
"loss": 0.6252,
"step": 7040
},
{
"epoch": 2.7668571708896463,
"grad_norm": 1.3255796782916087,
"learning_rate": 3.1509640015076946e-07,
"loss": 0.5879,
"step": 7050
},
{
"epoch": 2.77078172140597,
"grad_norm": 0.8088254429550722,
"learning_rate": 3.0463014347441255e-07,
"loss": 0.5519,
"step": 7060
},
{
"epoch": 2.7747062719222937,
"grad_norm": 2.202949542374798,
"learning_rate": 2.9433797058006195e-07,
"loss": 0.598,
"step": 7070
},
{
"epoch": 2.7786308224386174,
"grad_norm": 2.192327750139598,
"learning_rate": 2.842200662673111e-07,
"loss": 0.6815,
"step": 7080
},
{
"epoch": 2.782555372954941,
"grad_norm": 1.7866089200056756,
"learning_rate": 2.7427661220669535e-07,
"loss": 0.603,
"step": 7090
},
{
"epoch": 2.7864799234712647,
"grad_norm": 1.2795477884768611,
"learning_rate": 2.645077869364354e-07,
"loss": 0.5773,
"step": 7100
},
{
"epoch": 2.7904044739875884,
"grad_norm": 0.7183394998611272,
"learning_rate": 2.5491376585923265e-07,
"loss": 0.5313,
"step": 7110
},
{
"epoch": 2.794329024503912,
"grad_norm": 2.308178999869327,
"learning_rate": 2.4549472123911564e-07,
"loss": 0.5919,
"step": 7120
},
{
"epoch": 2.7982535750202358,
"grad_norm": 2.2719235337571813,
"learning_rate": 2.362508221983484e-07,
"loss": 0.6633,
"step": 7130
},
{
"epoch": 2.8021781255365594,
"grad_norm": 1.8358853033909601,
"learning_rate": 2.2718223471439815e-07,
"loss": 0.6084,
"step": 7140
},
{
"epoch": 2.806102676052883,
"grad_norm": 1.3561609455618762,
"learning_rate": 2.182891216169447e-07,
"loss": 0.5946,
"step": 7150
},
{
"epoch": 2.810027226569207,
"grad_norm": 0.801043920678816,
"learning_rate": 2.0957164258497031e-07,
"loss": 0.5394,
"step": 7160
},
{
"epoch": 2.8139517770855305,
"grad_norm": 2.3157120536637685,
"learning_rate": 2.0102995414387983e-07,
"loss": 0.5861,
"step": 7170
},
{
"epoch": 2.817876327601854,
"grad_norm": 2.0452627880092886,
"learning_rate": 1.9266420966270182e-07,
"loss": 0.6566,
"step": 7180
},
{
"epoch": 2.821800878118178,
"grad_norm": 1.883254183997997,
"learning_rate": 1.8447455935132418e-07,
"loss": 0.6011,
"step": 7190
},
{
"epoch": 2.8257254286345015,
"grad_norm": 1.277403091223367,
"learning_rate": 1.764611502578051e-07,
"loss": 0.573,
"step": 7200
},
{
"epoch": 2.829649979150825,
"grad_norm": 0.8701579907748704,
"learning_rate": 1.6862412626572845e-07,
"loss": 0.5748,
"step": 7210
},
{
"epoch": 2.833574529667149,
"grad_norm": 2.2395045060776115,
"learning_rate": 1.6096362809162047e-07,
"loss": 0.5897,
"step": 7220
},
{
"epoch": 2.8374990801834725,
"grad_norm": 2.1921791669212864,
"learning_rate": 1.5347979328242613e-07,
"loss": 0.6472,
"step": 7230
},
{
"epoch": 2.841423630699796,
"grad_norm": 1.7683013517405388,
"learning_rate": 1.461727562130344e-07,
"loss": 0.6176,
"step": 7240
},
{
"epoch": 2.84534818121612,
"grad_norm": 1.2599894592930116,
"learning_rate": 1.3904264808387246e-07,
"loss": 0.583,
"step": 7250
},
{
"epoch": 2.8492727317324436,
"grad_norm": 0.8602212034932214,
"learning_rate": 1.320895969185454e-07,
"loss": 0.5383,
"step": 7260
},
{
"epoch": 2.8531972822487672,
"grad_norm": 2.01097097204698,
"learning_rate": 1.2531372756153458e-07,
"loss": 0.5882,
"step": 7270
},
{
"epoch": 2.857121832765091,
"grad_norm": 2.29115443827529,
"learning_rate": 1.1871516167596186e-07,
"loss": 0.659,
"step": 7280
},
{
"epoch": 2.8610463832814146,
"grad_norm": 1.798405094313227,
"learning_rate": 1.1229401774140447e-07,
"loss": 0.6425,
"step": 7290
},
{
"epoch": 2.8649709337977383,
"grad_norm": 1.2675180226420815,
"learning_rate": 1.0605041105176128e-07,
"loss": 0.5757,
"step": 7300
},
{
"epoch": 2.868895484314062,
"grad_norm": 0.778167866423488,
"learning_rate": 9.998445371319332e-08,
"loss": 0.5289,
"step": 7310
},
{
"epoch": 2.8728200348303856,
"grad_norm": 2.1530650160440845,
"learning_rate": 9.409625464210093e-08,
"loss": 0.582,
"step": 7320
},
{
"epoch": 2.8767445853467093,
"grad_norm": 2.039982400238999,
"learning_rate": 8.83859195631731e-08,
"loss": 0.6649,
"step": 7330
},
{
"epoch": 2.880669135863033,
"grad_norm": 1.8192255183888113,
"learning_rate": 8.285355100748904e-08,
"loss": 0.6083,
"step": 7340
},
{
"epoch": 2.8845936863793566,
"grad_norm": 1.3511200742987195,
"learning_rate": 7.749924831067401e-08,
"loss": 0.5947,
"step": 7350
},
{
"epoch": 2.8885182368956803,
"grad_norm": 0.8078794663885482,
"learning_rate": 7.232310761112082e-08,
"loss": 0.5189,
"step": 7360
},
{
"epoch": 2.892442787412004,
"grad_norm": 2.086447914981744,
"learning_rate": 6.732522184825896e-08,
"loss": 0.5929,
"step": 7370
},
{
"epoch": 2.8963673379283277,
"grad_norm": 2.2301352619297097,
"learning_rate": 6.250568076088814e-08,
"loss": 0.6598,
"step": 7380
},
{
"epoch": 2.9002918884446514,
"grad_norm": 1.8389039532268878,
"learning_rate": 5.7864570885567405e-08,
"loss": 0.6154,
"step": 7390
},
{
"epoch": 2.904216438960975,
"grad_norm": 1.2792199660600578,
"learning_rate": 5.340197555505966e-08,
"loss": 0.6012,
"step": 7400
},
{
"epoch": 2.9081409894772987,
"grad_norm": 0.7325366163854787,
"learning_rate": 4.911797489683734e-08,
"loss": 0.5215,
"step": 7410
},
{
"epoch": 2.9120655399936224,
"grad_norm": 1.8968115785034971,
"learning_rate": 4.5012645831640225e-08,
"loss": 0.5973,
"step": 7420
},
{
"epoch": 2.915990090509946,
"grad_norm": 2.115373675723123,
"learning_rate": 4.108606207209875e-08,
"loss": 0.6792,
"step": 7430
},
{
"epoch": 2.9199146410262697,
"grad_norm": 1.8029011243698139,
"learning_rate": 3.7338294121407324e-08,
"loss": 0.62,
"step": 7440
},
{
"epoch": 2.9238391915425934,
"grad_norm": 1.288742701315245,
"learning_rate": 3.376940927206196e-08,
"loss": 0.5808,
"step": 7450
},
{
"epoch": 2.927763742058917,
"grad_norm": 0.798534719587082,
"learning_rate": 3.037947160464572e-08,
"loss": 0.5186,
"step": 7460
},
{
"epoch": 2.9316882925752408,
"grad_norm": 2.5141632319612954,
"learning_rate": 2.716854198668517e-08,
"loss": 0.577,
"step": 7470
},
{
"epoch": 2.9356128430915645,
"grad_norm": 2.35827100419327,
"learning_rate": 2.41366780715524e-08,
"loss": 0.6523,
"step": 7480
},
{
"epoch": 2.939537393607888,
"grad_norm": 1.8102877923296363,
"learning_rate": 2.1283934297432472e-08,
"loss": 0.6021,
"step": 7490
},
{
"epoch": 2.943461944124212,
"grad_norm": 1.3570094551959164,
"learning_rate": 1.861036188634424e-08,
"loss": 0.5859,
"step": 7500
},
{
"epoch": 2.9473864946405355,
"grad_norm": 0.8112341118577404,
"learning_rate": 1.6116008843224395e-08,
"loss": 0.5446,
"step": 7510
},
{
"epoch": 2.951311045156859,
"grad_norm": 2.0316666239859726,
"learning_rate": 1.3800919955058167e-08,
"loss": 0.5938,
"step": 7520
},
{
"epoch": 2.955235595673183,
"grad_norm": 2.24783418646791,
"learning_rate": 1.1665136790084408e-08,
"loss": 0.6663,
"step": 7530
},
{
"epoch": 2.9591601461895065,
"grad_norm": 1.7761857680646513,
"learning_rate": 9.708697697040636e-09,
"loss": 0.6199,
"step": 7540
},
{
"epoch": 2.96308469670583,
"grad_norm": 1.2633693500353296,
"learning_rate": 7.931637804481362e-09,
"loss": 0.5774,
"step": 7550
},
{
"epoch": 2.967009247222154,
"grad_norm": 0.8065989220041421,
"learning_rate": 6.333989020143039e-09,
"loss": 0.5492,
"step": 7560
},
{
"epoch": 2.9709337977384775,
"grad_norm": 2.3130844134472537,
"learning_rate": 4.915780030372297e-09,
"loss": 0.5792,
"step": 7570
},
{
"epoch": 2.974858348254801,
"grad_norm": 2.1600268864719765,
"learning_rate": 3.6770362996108033e-09,
"loss": 0.6643,
"step": 7580
},
{
"epoch": 2.978782898771125,
"grad_norm": 1.78858714847675,
"learning_rate": 2.617780069940068e-09,
"loss": 0.6227,
"step": 7590
},
{
"epoch": 2.9827074492874486,
"grad_norm": 1.2711261056787222,
"learning_rate": 1.738030360677323e-09,
"loss": 0.5816,
"step": 7600
},
{
"epoch": 2.9866319998037723,
"grad_norm": 0.8400903589420956,
"learning_rate": 1.0378029680391254e-09,
"loss": 0.5403,
"step": 7610
},
{
"epoch": 2.990556550320096,
"grad_norm": 2.242115052909609,
"learning_rate": 5.171104648549196e-10,
"loss": 0.6179,
"step": 7620
},
{
"epoch": 2.9944811008364196,
"grad_norm": 2.174676570322404,
"learning_rate": 1.759622003427719e-10,
"loss": 0.658,
"step": 7630
},
{
"epoch": 2.9984056513527433,
"grad_norm": 1.9961793276061766,
"learning_rate": 1.436429993950661e-11,
"loss": 0.5866,
"step": 7640
},
{
"epoch": 2.999975471559273,
"eval_loss": 0.5420118570327759,
"eval_runtime": 1735.1416,
"eval_samples_per_second": 14.408,
"eval_steps_per_second": 3.602,
"step": 7644
},
{
"epoch": 2.999975471559273,
"step": 7644,
"total_flos": 1249712503734272.0,
"train_loss": 0.20236909012093338,
"train_runtime": 73063.4732,
"train_samples_per_second": 13.392,
"train_steps_per_second": 0.105
}
],
"logging_steps": 10,
"max_steps": 7644,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1249712503734272.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}